diff --git a/.clang-format b/.clang-format index 8c8ff47073a5abfa9fde8b5d8a7331e4c012a8fa..473f8407e361e561395a04d8606c51654a6a67a4 100644 --- a/.clang-format +++ b/.clang-format @@ -1,5 +1,4 @@ --- -Language: Cpp # BasedOnStyle: LLVM AccessModifierOffset: -2 AlignAfterOpenBracket: Align diff --git a/.clang-tidy b/.clang-tidy index e5ada0539c7183ca304f9d1f1ab2d4a4d7a40155..2d2d4e79dffcd494d99d07213e938203ad316b6a 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -1,7 +1,7 @@ --- Checks: '-*,readability*,-readability-magic-numbers,-readability-function-size,-readability-function-cognitive-complexity,-readability-identifier-length' WarningsAsErrors: '*' -HeaderFilterRegex: '*.h,*.hpp' +HeaderFilterRegex: '.*' FormatStyle: 'file' CheckOptions: - { key: readability-identifier-naming.ClassCase, value: lower_case } diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..d16386367f7cd7dd3c1842c484239e9e82a25efc --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +build/ \ No newline at end of file diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index f696240c6ce8b441b922677187039f5b1a0eed75..251f71e42d608fe13c1544e2e118a32c1161f70f 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -2,12 +2,13 @@ default: image: ubuntu:22.04 tags: - - arm64 + - arm64-shared before_script: - - apt-get update --yes - - apt-get install --yes cmake g++ gcovr + - apt update --yes + - apt install --yes cmake g++ gcovr build: + stage: build script: - mkdir build - cd build && cmake -DBUILD_TESTING=On -DARMRAL_ENABLE_COVERAGE=On .. && make -j check && gcovr -r .. @@ -15,3 +16,14 @@ build: rules: - if: $CI_PIPELINE_SOURCE == "merge_request_event" - if: $CI_PIPELINE_SOURCE == "push" + +format: + stage: build + script: + - apt install --yes git curl + - curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && python3 get-pip.py + - python3 -m pip install clang-format==17.0.6 + - clang-format $(git diff --diff-filter=d --name-only origin/main | grep -e "\.[ch]$" -e "\.[ch]pp$") --Werror --dry-run + rules: + - if: $CI_PIPELINE_SOURCE == "merge_request_event" + - if: $CI_PIPELINE_SOURCE == "push" \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 85c1650d1261804644858c11312532f88ea3e3b8..6ddc959791449e79b26142e7d93aebdc6e2df65e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,82 @@ documented in this file. ### Security + +## [25.04] - 2025-04-30 + +### Added + +- Added the functions `armral_fft_create_2d_plan_cf32` and + `armral_fft_create_2d_plan_cs16`. These functions are used to create 2D FFT + plans. 2D FFT plans are executed and destroyed using the same interface as + the 1D FFT plans. + + - Added the example `fft_2d_cf32_example.c`. + +### Changed + +- Improved error correction of `armral_ldpc_decode_block` due to the use of a + new algorithm. + +- Changed the interface of `armral_ldpc_decode_block`. The new interface adds + support for rate recovery redundancy versions, specifying the frequency of + CRC checks and implicit filler bits: + + - Added an `n` parameter which specifies the number LLRs in the input. + Previous versions of ArmRAL assumed all LLRs were provided. + + - Added an `options` parameter. 
The `options` parameter accepts the following + options: + - `ARMRAL_LDPC_DEFAULT_OPTIONS` + - `ARMRAL_LDPC_CRC_NO` + - `ARMRAL_LDPC_CRC_16` + - `ARMRAL_LDPC_CRC_24A` + - `ARMRAL_LDPC_CRC_24B` + - `ARMRAL_LDPC_CRC_EVERY_ITER` + - `ARMRAL_LDPC_CRC_END_ITER` + - `ARMRAL_LDPC_FILLER_BITS_IMPLICIT` + - `ARMRAL_LDPC_FILLER_BITS_EXPLICIT` + + See `armral_ldpc_decode_block`'s documentation for a description of all the + available options. + + - Added `ARMRAL_FAIL` to the `armral_status` enum. + `armral_ldpc_decode_block` returns `ARMRAL_FAIL` if + `ARMRAL_LDPC_CRC_NO` is not set and the CRC check fails. + + To preserve the old behavior when upgrading ArmRAL, make the following + changes: + - For base graph 1, set `n` to `n = 66 * z`, for base graph 2, set `n` to + `50 * z` + - `crc_idx` has been replaced with `len_filler_bits` and `options`. + - If `crc_idx` was set to 0 or `ARMRAL_LDPC_CRC_NO`, set `len_filler_bits` + to 0 and set `options` to `ARMRAL_LDPC_DEFAULT_OPTIONS`. + - If `crc_idx` was set to a non-zero integer and `bg=LDPC_BASE_GRAPH_1`, + set `len_filler_bits` to `20 * z - crc_idx - 24` and set `options` to + `ARMRAL_LDPC_CRC_24B`. + - If `crc_idx` was set to a non-zero integer and `bg=LDPC_BASE_GRAPH_2`, + set `len_filler_bits` to `8 * z - crc_idx - 24` and set `options` to + `ARMRAL_LDPC_CRC_24B`. + +- The minimum supported version of CMake is 3.10. + +- The recommended version of clang-format is 17.0.6. + +- ArmRAL debug builds no longer pass `-Og` to the compiler. Debug builds + contain more debug information but are slower. + +### Removed + +- Removed `ARMRAL_LDPC_NO_CRC`. + +### Fixed + +- Fixed `armral_polar_frozen_mask` when `k = 21`, `n_pc = 3` and `n_pc_wm = 1`. + The parity check bit corresponding to the minimum row weight is selected from + the `k - n_pc` most reliable bits, rather than the `k - n_pc + n_pc_wm` most + reliable bits. See TS 38.212, section 5.3.1.2, paragraph 5. + + ## [25.01] - 2025-01-23 ### Added @@ -43,7 +119,9 @@ documented in this file. The number of batches is specified using the flag "`-b `". - FFT lengths up to 42012 are now supported, although lengths greater - than 4096 are mostly untested. + than 4096 are mostly untested. Note that some large prime plans less + than 42012 are not supported because computation involves FFTs of + length greater than 42012 using Bluestein's algorithm. ### Removed diff --git a/CMakeLists.txt b/CMakeLists.txt index 095548a00bf2664336fd4b0cc23f3df02154a86c..33835c7a0bd8d0360fbd696590718a9ad369a06a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,11 +1,5 @@ -cmake_minimum_required(VERSION 3.3) -project(armral VERSION 25.01) - -if(CMAKE_VERSION VERSION_GREATER 3.4) - # Stop CMake from automatically adding -rdynamic to linker flags because it - # causes a warning about unused compiler options when using Clang - cmake_policy(SET CMP0065 NEW) -endif() +cmake_minimum_required(VERSION 3.10) +project(armral VERSION 25.04) # Set default build type if none was specified with -DCMAKE_BUILD_TYPE=... 
if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) @@ -77,8 +71,11 @@ set(ARMRAL_LIB_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ac_n_uu.c ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_kernel_lookup.c ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uu.c ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_uu.c ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uun.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_gu.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ab_t_gs.c ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16_kernel_lookup.c ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_execute.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_level.cpp @@ -328,7 +325,7 @@ if(NOT ARMRAL_OVERRIDE_COMPILE_FLAGS) -fno-rtti -fno-exceptions -std=c++17> - $<$:-Og + $<$: -g3 -ggdb -fno-omit-frame-pointer>) @@ -477,6 +474,11 @@ if(BUILD_TESTING) DEPENDS bench_${BENCH_NAME}) endfunction() + if(NOT ${CMAKE_SYSTEM_NAME} MATCHES "Darwin") + # Correlation doesn't build on macOS 15.3. + add_armral_test(correlation test/LowerPHY/Correlation/main.cpp) + endif() + add_armral_test(matrix_inv_batch test/BasicMathFun/MatrixInv/Batch/main.cpp) add_armral_test(matrix_inv_single test/BasicMathFun/MatrixInv/Single/main.cpp) add_armral_test(arm_solve @@ -529,9 +531,10 @@ if(BUILD_TESTING) test/DuRuInterface/ORanBlockScaling/Compression/main.cpp) add_armral_test(block_scaling_decompression test/DuRuInterface/ORanBlockScaling/Decompression/main.cpp) - add_armral_test(correlation test/LowerPHY/Correlation/main.cpp) add_armral_test(fft_cs16 test/LowerPHY/FFT/FFT16/main.cpp) + add_armral_test(fft_cs16_2d test/LowerPHY/FFT/FFT16_2d/main.cpp) add_armral_test(fft_cf32 test/LowerPHY/FFT/FFT32/main.cpp) + add_armral_test(fft_cf32_2d test/LowerPHY/FFT/FFT32_2d/main.cpp) add_armral_test(arm_fir_filter_cs16 test/LowerPHY/FIR/FIR16/main.cpp) add_armral_test(arm_fir_filter_cs16_decimate_2 test/LowerPHY/FIR/FIR16Decimate2/main.cpp) @@ -726,7 +729,9 @@ if(BUILD_TESTING) bench/DuRuInterface/ORanBlockScaling/Decompression/9bit/main.cpp) add_armral_bench(correlation bench/LowerPHY/Correlation/main.cpp) add_armral_bench(fft_cs16 bench/LowerPHY/FFT/FFT16/main.cpp) + add_armral_bench(fft_cs16_2d bench/LowerPHY/FFT/FFT16_2d/main.cpp) add_armral_bench(fft_cf32 bench/LowerPHY/FFT/FFT32/main.cpp) + add_armral_bench(fft_cf32_2d bench/LowerPHY/FFT/FFT32_2d/main.cpp) add_armral_bench(arm_fir_filter_cs16 bench/LowerPHY/FIR/FIR16/main.cpp) add_armral_bench(arm_fir_filter_cs16_decimate_2 bench/LowerPHY/FIR/FIR16Decimate2/main.cpp) @@ -808,6 +813,7 @@ if(BUILD_EXAMPLES) add_armral_example(examples/block_float_9b_example.c) add_armral_example(examples/fft_cf32_example.c 10) + add_armral_example(examples/fft_2d_cf32_example.c 4 5) add_armral_example(examples/modulation_example.c) add_armral_example(examples/polar_example.cpp 128 100 35) endif() diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 646faa55a375c68820c4ba319d50cc9556c0ccdc..c6bbee412ddb9a90424273a7d7c9a86a613853eb 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -258,7 +258,7 @@ C/C++ code style is maintained through the use of `clang-format` and patch; instructions on how to run these tools are given below. `clang-format` and `clang-tidy` are part of the [LLVM -Project](https://llvm.org/). ArmRAL is tested with version 17.0.4 of +Project](https://llvm.org/). 
ArmRAL is tested with version 17.0.6 of the tools. Matching your coding style as close as possible to the `clang-tidy` diff --git a/CREDITS.md b/CREDITS.md index 0271d77b8ea30bc4da7ed29edf1cb040d2e08f64..b469eb4b6fdf7813b224d3d4f1b933988174b79f 100644 --- a/CREDITS.md +++ b/CREDITS.md @@ -2,6 +2,15 @@ In addition to the primary development being done by Arm, the following people and organizations have contributed to Arm RAN Acceleration Library: +- Bug report and fix to `armral_polar_frozen_mask` was contributed by Jio + Platforms Limited for 25.04. Prior to 25.04, an incorrect mask was returned + when using parameters `k = 21`, `n_pc = 3` and `n_pc_wm = 1`. + +- Work on `armral_ldpc_decode_block` to improve LDPC error correction was + contributed upstream by 4g5g Consultants. See + and + . + - Work on `armral_ldpc_rate_recovery` to correctly set the log-likelihood ratios of filler bits was contributed upstream by 4g5g Consultants. See diff --git a/Doxyfile.in b/Doxyfile.in index aace5314a8c80356dadf535167bba74d50351797..fc38b3692365fea94575ad68cfef815f0239c62d 100644 --- a/Doxyfile.in +++ b/Doxyfile.in @@ -38,7 +38,7 @@ PROJECT_NAME = "Arm RAN Acceleration Library Reference Guide" # could be handy for archiving the generated documentation or if some version # control system is used. -PROJECT_NUMBER = "25.01" +PROJECT_NUMBER = "25.04" # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a diff --git a/README.md b/README.md index bf005ff37ba5f2f3e1e239b0ca17a2137a53de98..56d49d826baa38be826327e8b52885231c3f6e67 100644 --- a/README.md +++ b/README.md @@ -55,35 +55,20 @@ and the `examples` directory contains the examples. **Note:** If you are building the SVE or SVE2 version of the library, you must compile with GCC 11.1.0 or newer. -2. Build ArmRAL. Navigate to the unpacked product directory and use the - following commands: +2. Build ArmRAL. Navigate to the product directory and use the following + commands: - mkdir - cd - cmake {options} -DBUILD_TESTING=On -DBUILD_EXAMPLES=On -DCMAKE_INSTALL_PREFIX= + mkdir build + cd build + cmake -DBUILD_TESTING=On -DBUILD_EXAMPLES=On .. make - Substituting: - - * `` with a build directory name. The library builds in the - specified directory. - * `{options}` with the CMake options to use to build the library. - * (Optional) `` with an installation directory name. When you - install ArmRAL (see **Install ArmRAL**), the library installs to the - specified directory. If `` is not specified, the default is - `/usr/local`. - * `` with the path to the root directory of the library source. - Notes: * The `-DBUILD_TESTING=On` and `-DBUILD_EXAMPLES=On` options are required if you want to run the library tests and benchmarks (`-DBUILD_TESTING`) and examples (`-DBUILD_EXAMPLES`). - * The `-DCMAKE_INSTALL_PREFIX=` option specifies the base - directory used to install the library. The library archive is installed to - `/lib` and headers are installed to `/include`. - The default location is `/usr/local`. * By default, a static library is built. To build a dynamic or a static library use the `-DBUILD_SHARED_LIBS={On|Off}` option. @@ -92,7 +77,15 @@ and the `examples` directory contains the examples. optimized library to build (Neon, SVE, or SVE2), use the `-DARMRAL_ARCH={NEON|SVE|SVE2}` option. - Other common CMake `{options}` include: + Common CMake options include: + + * `-DCMAKE_INSTALL_PREFIX=` + + Specifies the base directory used to install the library. 
The library + archive is installed to `/lib` and headers are installed to + `/include`. + + Default is `/usr/local`. * `-DCMAKE_BUILD_TYPE={Debug|Release}` @@ -397,7 +390,7 @@ file. The Arm RAN Acceleration Library Reference Guide is available online at: - https://developer.arm.com/documentation/102249/2501 + https://developer.arm.com/documentation/102249/2504 If you have Doxygen installed on your system, you can build a local HTML version of the ArmRAL documentation using CMake. diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 27b7cfe260b92cb898420543a1ed2b2d0728ce38..cc940e03de9ddbf9c12d93753fbf82861e92fd69 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -1,4 +1,4 @@ -# Arm RAN Acceleration Library 25.01 Release Notes +# Arm RAN Acceleration Library 25.04 Release Note Non-Confidential Copyright © 2020-2025 Arm Limited (or its affiliates). All rights reserved. @@ -9,7 +9,7 @@ this document. ## Contents -These Release Notes contain the following sections: +This Release Note contains: - Release overview - Release contents @@ -20,7 +20,7 @@ These Release Notes contain the following sections: ## Release overview -This section describes the product to which these release notes relate and +This section describes the product to which this release note relates and provides information about its license. ### Product description @@ -43,16 +43,16 @@ ArmRAL includes functions that operate on 16-bit signed integers and 16-bit and ### Release status -This is the 25.01 release of ArmRAL. +This is the 25.04 release of ArmRAL. ### Licensing information Use of ArmRAL is subject to a BSD-3-Clause license. See the `LICENSE.md` file -in your product installation for the license text. We will receive inbound +in your product installation for the license text. We receive inbound contributions under the same license. -If you require a different license than BSD-3-Clause for compatibility with your -end product, please get in contact via including +If you require a different license than BSD-3-Clause for compatibility with +your end product, get in contact via including "[ArmRAL]" in the email subject line. ## Release contents @@ -61,7 +61,7 @@ ArmRAL releases contain documentation and source files. This section describes: -- Cloning the product's git repository from Arm's GitLab +- Cloning the product's Git repository from Arm's GitLab - The contents of this release - The changes since the previous release - Any known issues and limitations that exist at the time of this release @@ -71,28 +71,30 @@ This section describes: ArmRAL is available on [Arm's GitLab website](https://gitlab.arm.com/networking/ral). -**To access this release, clone the following repository using HTTPS:** +To access this release, clone the repository using HTTPS: - git clone -b armral-25.01 https://git.gitlab.arm.com/networking/ral +``` +git clone -b armral-25.04 https://git.gitlab.arm.com/networking/ral +``` ### Deliverables -The downloaded product includes the following deliverables: +The downloaded product includes the deliverables: -- ArmRAL 25.01 -- Release Notes (this document) +- ArmRAL 25.04 +- Release Note (this document) - Documentation Product documentation is available on the -[Arm Developer website](https://developer.arm.com/documentation/102249/2501). +[Arm Developer website](https://developer.arm.com/documentation/102249/2504). -**Note:** Documentation, errata and release notes might change between product +**Note:** Documentation, errata and release note might change between product releases. 
For the latest documentation bundle, check the product download page. ### Differences from previous release -The following sections describe differences from the previous release of +This section describes differences from the previous release of ArmRAL. #### Additions and functionality changes @@ -100,51 +102,47 @@ ArmRAL. This section describes new features or any technical changes to features or components in this release. -- The functions `armral_turbo_decode_batch`, and - `armral_turbo_decode_batch_noalloc` have been added. These functions implement - a maximum a posteriori (MAP) algorithm to decode the output of the LTE Turbo - encoding scheme on a batch of encoded data. +- `armral_ldpc_decode_block` has significantly improved error correction. -- The function `armral_turbo_decode_batch_noalloc_buffer_size` has been added, - which returns the size of buffer required for - `armral_turbo_decode_batch_noalloc`. +- The interface to `armral_ldpc_decode_block` has changed to support redundancy + versions from rate recovery and implicit filler bits. Previous versions of + ArmRAL assumed a full input of LLRs. To support different redundancy + versions, `armral_ldpc_decode_block` now additionally accepts the length of + the LLR. Details of how to update to the new interface can be found in + CHANGELOG.md. -- FFT lengths up to 42012 are now supported, although lengths greater - than 4096 are mostly untested. +- The functions `armral_fft_create_2d_plan_cf32` and + `armral_fft_create_2d_plan_cs16` have been added. These can be used to create + 2D FFT plans. 2D FFT plans can be executed and destroyed using the same + functions as the 1D FFT plans. -- Unused FFT kernels have been removed. + - The example `fft_2d_cf32_example.c` has been added. #### Performance improvements This section describes any features or components with improved performance. -- Neon and SVE performance improvements for the following routines: - - - `armral_fft_execute_cf32` and `armral_fft_execute_cs16`. +- The performance of `armral_ldpc_decode_block` has degraded due to the use of + a different algorithm. #### Changes to simulation programs This section describes any changes, new features or components added to the channel simulation programs in this release. -- The LTE Turbo coding Additive White Gaussian Noise (AWGN) simulation now - supports the decoding of batches of data, using `armral_turbo_decode_batch`. - The number of batches is specified using the flag "`-b `". +- `ldpc_awgn` has been updated to use the new interface of + `armral_ldpc_decode_block`. It also now only reports the bit error rates and + block error rates of the message bits; parity bits are ignored. #### Resolved issues This section describes any known issues resolved in the current release. -- Improved error correction of LDPC decoding (`armral_ldpc_decode_block`) in - the presence of channel noise. The function now uses 16-bit signed integers - internally rather than 8-bit signed integers. This may result in decreased - performance. - -- The arguments to the function `armral_turbo_decode_block_noalloc_buffer_size` - have been changed to remove the unused second argument, `max_iter`. - -- When planning FFTs with an unsupported length, `armral_fft_create_plan_cf32` - and `armral_fft_create_plan_cs16` now return `ARMRAL_ARGUMENT_ERROR`. +- `armral_polar_frozen_mask` now returns the correct frozen mask for `k = 21`, + `n_pc = 3` and `n_pc_wm = 1`. 
The parity check bit corresponding to the + minimum row weight is now selected from the `k - n_pc` most reliable bits, + rather than the `k - n_pc + n_pc_wm` most reliable bits. See TS 38.212, + section 5.3.1.2, paragraph 5. ### Known limitations @@ -156,11 +154,10 @@ This section describes any known limitations of the current release. If you have any issues with the installation, content, or use of this release, raise an issue on [Arm's GitLab website](https://gitlab.arm.com/networking/ral/-/issues). -Arm will respond as soon as possible. ### Tools -To build or run ArmRAL you will need: +To build or run ArmRAL you need: - A C/C++ compiler, such as GCC. ArmRAL has been tested with GCC 7.5.0, 8.5.0, 9.5.0, 10.5.0, 11.5.0, 12.4.0, 13.3.0, and 14.2.0. @@ -170,7 +167,7 @@ To build or run ArmRAL you will need: GCC compiler on the [Arm Developer website](https://developer.arm.com/tools-and-software/open-source-software/developer-tools/gnu-toolchain/gnu-a/downloads). The variant to use for an AArch64 GNU/Linux target is `aarch64-none-linux-gnu`. -- CMake version 3.3.0 or higher. +- CMake version 3.10 or higher. Additionally: @@ -186,7 +183,7 @@ Additionally: **Note:** ArmRAL runs on AArch64 cores, however to use the convolutional encoder, CRC, and sequence generator functions you must run on a core that supports the AArch64 PMULL extension. If the system you are using supports the -PMULL extension, `pmull` will be included in the "Features" list in the +PMULL extension, `pmull` is included in the "Features" list in the `/proc/cpuinfo` file. ## Release history @@ -195,7 +192,7 @@ ArmRAL's release history is available on the [Arm Developer website](https://dev ## Conventions -The following sections describe conventions used in Arm documents. +This section describes conventions used in Arm documents. 
### Glossary diff --git a/bench/LowerPHY/FFT/FFT16_2d/bench.py b/bench/LowerPHY/FFT/FFT16_2d/bench.py new file mode 100755 index 0000000000000000000000000000000000000000..b16fdc74f7816a03e5c65b8cf3c5f11958be8b58 --- /dev/null +++ b/bench/LowerPHY/FFT/FFT16_2d/bench.py @@ -0,0 +1,47 @@ +#!/usr/bin/env python3 +# Arm RAN Acceleration Library +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause + +import json +from pathlib import Path +import os + + +def get_path(x): return x if Path(x).is_file() else os.path.join("armral", x) + + +exe_name = get_path("bench_fft_cs16_2d") + +reps_base = 10000000 +nArr = [ + (8, 8), (16, 16), (32, 32), (64, 64), + (128, 128), (256, 256), + (16, 64), (64, 16), (128, 256), (256, 128), + (32, 512), (512, 32), (288, 288) +] + +j = { + "exe_name": exe_name, + "cases": [] +} + +for n0, n1 in nArr: + reps = reps_base // (n0 * n1) + + case_fwd = { + "name": "fft_cs16_2d_fwd_{}x{}".format(n0, n1), + "args": "{} {} -1".format(n0, n1), + "reps": reps + } + j["cases"].append(case_fwd) + + case_bwk = { + "name": "fft_cs16_2d_bwk_{}x{}".format(n0, n1), + "args": "{} {} 1".format(n0, n1), + "reps": reps + } + j["cases"].append(case_bwk) + +print(json.dumps(j)) diff --git a/bench/LowerPHY/FFT/FFT16_2d/main.cpp b/bench/LowerPHY/FFT/FFT16_2d/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..698daa60e1b5c71794244bcfda8db2ba13201d82 --- /dev/null +++ b/bench/LowerPHY/FFT/FFT16_2d/main.cpp @@ -0,0 +1,58 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause +*/ +#include "armral.h" + +#include +#include +#include +#include + +namespace { + +void run_fft_i16_2d_perf(uint32_t n0, uint32_t n1, armral_fft_direction_t dir, + uint32_t num_reps) { + const char *dir_name = dir < 0 ? 
"forwards" : "backwards"; + printf("[FFT i16] - n0 = %u, n1 = %u, direction = %s, number of iterations " + "= %u\n", + n0, n1, dir_name, num_reps); + + const std::vector x(n0 * n1); + std::vector y(n0 * n1); + + armral_fft_plan_t *p; + armral_fft_create_2d_plan_cs16(&p, n0, n1, dir); + assert(p); + const auto *x_ptr = x.data(); + auto *y_ptr = y.data(); + + for (uint32_t i = 0; i < num_reps; ++i) { + armral_fft_execute_cs16(p, x_ptr, y_ptr); + } + + armral_fft_destroy_plan_cs16(&p); +} + +} // anonymous namespace + +int main(int argc, char **argv) { + if (argc != 5) { + // n0 - Length of first dimension + // n1 - Length of second dimension + // dir - The direction + // nreps - The number of times to repeat the function + fprintf(stderr, "usage: %s n0 n1 dir nreps\n", argv[0]); + exit(EXIT_FAILURE); + } + auto n0 = (uint32_t)atoi(argv[1]); + auto n1 = (uint32_t)atoi(argv[2]); + auto dir = (armral_fft_direction_t)atoi(argv[3]); + auto num_reps = (uint32_t)atoi(argv[4]); + + run_fft_i16_2d_perf(n0, n1, dir, num_reps); + + return EXIT_SUCCESS; +} diff --git a/bench/LowerPHY/FFT/FFT32_2d/bench.py b/bench/LowerPHY/FFT/FFT32_2d/bench.py new file mode 100755 index 0000000000000000000000000000000000000000..801d1d0419187e51f10a5c56d46d07e3c17bdd3b --- /dev/null +++ b/bench/LowerPHY/FFT/FFT32_2d/bench.py @@ -0,0 +1,47 @@ +#!/usr/bin/env python3 +# Arm RAN Acceleration Library +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause + +import json +from pathlib import Path +import os + + +def get_path(x): return x if Path(x).is_file() else os.path.join("armral", x) + + +exe_name = get_path("bench_fft_cf32_2d") + +reps_base = 10000000 +nArr = [ + (8, 8), (16, 16), (32, 32), (64, 64), + (128, 128), (256, 256), + (16, 64), (64, 16), (128, 256), (256, 128), + (32, 512), (512, 32), (288, 288) +] + +j = { + "exe_name": exe_name, + "cases": [] +} + +for n0, n1 in nArr: + reps = reps_base // (n0 * n1) + + case_fwd = { + "name": "fft_cf32_2d_fwd_{}x{}".format(n0, n1), + "args": "{} {} -1".format(n0, n1), + "reps": reps + } + j["cases"].append(case_fwd) + + case_bwk = { + "name": "fft_cf32_2d_bwk_{}x{}".format(n0, n1), + "args": "{} {} 1".format(n0, n1), + "reps": reps + } + j["cases"].append(case_bwk) + +print(json.dumps(j)) diff --git a/bench/LowerPHY/FFT/FFT32_2d/main.cpp b/bench/LowerPHY/FFT/FFT32_2d/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d5169720fedd61c3c10dfe6f1a89c9614ece2ff3 --- /dev/null +++ b/bench/LowerPHY/FFT/FFT32_2d/main.cpp @@ -0,0 +1,58 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause +*/ +#include "armral.h" + +#include +#include +#include +#include + +namespace { + +void run_fft_f32_2d_perf(uint32_t n0, uint32_t n1, armral_fft_direction_t dir, + uint32_t num_reps) { + const char *dir_name = dir < 0 ? 
"forwards" : "backwards"; + printf("[FFT f32] - n0 = %u, n1 = %u, direction = %s, number of iterations " + "= %u\n", + n0, n1, dir_name, num_reps); + + const std::vector x(n0 * n1); + std::vector y(n0 * n1); + + armral_fft_plan_t *p; + armral_fft_create_2d_plan_cf32(&p, n0, n1, dir); + assert(p); + const auto *x_ptr = x.data(); + auto *y_ptr = y.data(); + + for (uint32_t i = 0; i < num_reps; ++i) { + armral_fft_execute_cf32(p, x_ptr, y_ptr); + } + + armral_fft_destroy_plan_cf32(&p); +} + +} // anonymous namespace + +int main(int argc, char **argv) { + if (argc != 5) { + // n0 - Length of first dimension + // n1 - Length of second dimension + // dir - The direction + // nreps - The number of times to repeat the function + fprintf(stderr, "usage: %s n0 n1 dir nreps\n", argv[0]); + exit(EXIT_FAILURE); + } + auto n0 = (uint32_t)atoi(argv[1]); + auto n1 = (uint32_t)atoi(argv[2]); + auto dir = (armral_fft_direction_t)atoi(argv[3]); + auto num_reps = (uint32_t)atoi(argv[4]); + + run_fft_f32_2d_perf(n0, n1, dir, num_reps); + + return EXIT_SUCCESS; +} diff --git a/bench/UpperPHY/LDPC/Decoding/main.cpp b/bench/UpperPHY/LDPC/Decoding/main.cpp index 6a04de3e7d0c21c51b10051b3089a19165644e33..16486d73dd9338814d6fe6bc920ad4c1909b7601 100755 --- a/bench/UpperPHY/LDPC/Decoding/main.cpp +++ b/bench/UpperPHY/LDPC/Decoding/main.cpp @@ -14,8 +14,7 @@ namespace { void run_ldpc_decoding_perf(uint32_t n, armral_ldpc_graph_t bg, uint32_t z, uint32_t len_filler_bits, uint32_t num_its, - armral_ldpc_decode_options_t options, - uint32_t num_reps) { + uint32_t options, uint32_t num_reps) { printf("[LDPC DECODING] - base graph = %u, lifting size = %u, number of " "decoding " "iterations = %u, with length of filler bits = %u , number of " @@ -75,10 +74,7 @@ int main(int argc, char **argv) { auto nmessage_bits = bg == LDPC_BASE_GRAPH_1 ? z * 22 : z * 10; auto len_filler_bits = (crc_idx != 0U) ? nmessage_bits - crc_idx - 24 : 0; auto num_its = (uint32_t)atoi(argv[4]); - auto options = - (armral_ldpc_decode_options_t)((crc_idx != 0U) - ? ARMRAL_LDPC_DEFAULT_OPTIONS - : ARMRAL_LDPC_CRC_NO); + auto options = crc_idx != 0U ? ARMRAL_LDPC_CRC_24B : ARMRAL_LDPC_CRC_NO; auto num_reps = (uint32_t)atoi(argv[5]); run_ldpc_decoding_perf(n, bg, z, len_filler_bits, num_its, options, num_reps); diff --git a/docs/examples.md b/docs/examples.md index b3f0d76b904a316c2496fb55620c5878f07879ac..e5158aa767eca2b1bb15eb33e9d4509465726f76 100644 --- a/docs/examples.md +++ b/docs/examples.md @@ -17,7 +17,7 @@ Acceleration Library (ArmRAL). To build the library, use: - git clone -b armral-25.01 https://git.gitlab.arm.com/networking/ral.git + git clone -b armral-25.04 https://git.gitlab.arm.com/networking/ral.git mkdir ral/build cd ral/build cmake .. diff --git a/docs/frontmatter.md b/docs/frontmatter.md index 05ebc17afa9636479e16cf6705df37bca3715160..17d269089de363298f035f47e851df3b3a27b215 100644 --- a/docs/frontmatter.md +++ b/docs/frontmatter.md @@ -24,7 +24,7 @@ supplier and give: If you have any comments on content, send an e-mail to errata@arm.com. Give: * The title Arm RAN Acceleration Library Reference Guide. -* The number 102249_2501_00_en. +* The number 102249_2504_00_en. * If applicable, the relevant page number(s) to which your comments refer. * A concise explanation of your comments. 
@@ -138,3 +138,4 @@ Issue | Date | Confidentiality | Change 2407-00 | 18 July 2024 | Non-Confidential | Update for Arm RAN Acceleration Library v24.07 2410-00 | 17 October 2024 | Non-Confidential | Update for Arm RAN Acceleration Library v24.10 2501-00 | 23 January 2025 | Non-Confidential | Update for Arm RAN Acceleration Library v25.01 +2504-00 | 17 April 2025 | Non-Confidential | Update for Arm RAN Acceleration Library v25.04 diff --git a/examples/fft_2d_cf32_example.c b/examples/fft_2d_cf32_example.c new file mode 100644 index 0000000000000000000000000000000000000000..2ce40559bbd1cdcf97f3c5f0d3309f1234c6b25b --- /dev/null +++ b/examples/fft_2d_cf32_example.c @@ -0,0 +1,90 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause +*/ +#include "armral.h" + +#include +#include + +// This function shows how to create a plan and execute a 2-d FFT using the +// ArmRAL library +static void example_fft_plan_and_execute(int n0, int n1) { + armral_fft_plan_t *p; + printf("Planning 2-d FFT of dimensions %d by %d\n", n0, n1); + // In the planning, the direction of the FFT is indicated by the last + // parameter, which is either -1 (for forwards) or 1 (for backwards) + armral_fft_create_2d_plan_cf32(&p, n0, n1, -1); + + // Number of elements in the transform + int nn = n0 * n1; + + // Create the data that is to be used in FFTs. The input array (x) needs to + // be initialized. The output array (y) does not. + armral_cmplx_f32_t *x = + (armral_cmplx_f32_t *)malloc(nn * sizeof(armral_cmplx_f32_t)); + armral_cmplx_f32_t *y = + (armral_cmplx_f32_t *)malloc(nn * sizeof(armral_cmplx_f32_t)); + for (int i = 0; i < nn; ++i) { + x[i] = (armral_cmplx_f32_t){(float)i, (float)-i}; + y[i] = (armral_cmplx_f32_t){0.F, 0.F}; + } + + printf("Input Data:\n"); + for (int i = 0; i < n0; ++i) { + for (int j = 0; j < n1; ++j) { + printf("(%f + %fi) ", x[n1 * i + j].re, x[n1 * i + j].im); + } + printf("\n"); + } + printf("\n"); + + // The FFTs are executed with different input and output data. The length + // of the input and output arrays needs to be at least the same as that of + // the length parameter with which the plan was created. No checks are + // performed that this is the case in the library. + printf("Performing 2-d FFT of dimensions %d by %d\n", n0, n1); + armral_fft_execute_cf32(p, x, y); + + // A plan can be re-used to solve other FFTs, but once a plan is no longer + // needed, it needs to be destroyed to avoid leaking memory. + printf("Destroying plan for 2-d FFT of dimensions %d by %d\n", n0, n1); + armral_fft_destroy_plan_cf32(&p); + + printf("Result:\n"); + for (int i = 0; i < n0; ++i) { + for (int j = 0; j < n1; ++j) { + printf("(%f + %fi) ", y[n1 * i + j].re, y[n1 * i + j].im); + } + printf("\n"); + } + printf("\n"); + + // Need to free the pointers to data. These are not owned by the FFT plan, + // and it is the user's responsibility to manage the memory. 
+ free(x); + free(y); +} + +int main(int argc, char **argv) { + if (argc < 3) { + printf("Usage: %s dim0 dim1\n", argv[0]); + exit(EXIT_FAILURE); + } + + int n0 = atoi(argv[1]); + if (n0 < 1) { + printf("Length dim0 parameter must be positive and non-zero\n"); + exit(EXIT_FAILURE); + } + + int n1 = atoi(argv[2]); + if (n1 < 1) { + printf("Length dim1 parameter must be positive and non-zero\n"); + exit(EXIT_FAILURE); + } + + example_fft_plan_and_execute(n0, n1); +} diff --git a/include/armral.h b/include/armral.h index 057087a99b692aaba9bb8c909a98795b5ca689df..70d795fe65996f2e435f9fefb699d486ea0ede7e 100644 --- a/include/armral.h +++ b/include/armral.h @@ -106,8 +106,8 @@ extern "C" { */ typedef enum { ARMRAL_SUCCESS = 0, ///< No error. - ARMRAL_ARGUMENT_ERROR = -1, ///< One or more arguments are incorrect - ARMRAL_RESULT_FAIL = -2 ///< Result failed. + ARMRAL_ARGUMENT_ERROR = -1, ///< One or more arguments are incorrect. + ARMRAL_FAIL = -2, ///< A failure has been detected. } armral_status; /** @@ -749,9 +749,9 @@ armral_status armral_cmplx_mat_vec_mult_batch_i16_pa( * + Matrix and vector elements are complex int16 in Q15 format. * + Matrices are stored in memory in row-major order. * - * A 32-bit Q0.31 saturating accumulator is used internally. If you need a - * larger range, consider using \link armral_cmplx_mat_vec_mult_i16 \endlink - * instead. To get a Q15 result, the final result is narrowed to 16 bits with + * A 32-bit Q31 saturating accumulator is used internally. If you need a larger + * range, consider using \link armral_cmplx_mat_vec_mult_i16 \endlink instead. + * To get a Q15 result, the final result is narrowed to 16 bits with * saturation. * * @param[in] m The number of rows in matrix `A` and the length of @@ -786,10 +786,10 @@ armral_status armral_cmplx_mat_vec_mult_i16_32bit( * The same layout is used for vector elements, except that the offset to the * next vector element is `num_mats * num_vecs_per_mat`. * - * A 32-bit Q0.31 saturating accumulator is used internally. If you need a - * larger range, consider using \link armral_cmplx_mat_vec_mult_batch_i16 - * \endlink instead. To get a Q15 result, the final result is narrowed to 16 - * bits with saturation. + * A 32-bit Q31 saturating accumulator is used internally. If you need a larger + * range, consider using \link armral_cmplx_mat_vec_mult_batch_i16 \endlink + * instead. To get a Q15 result, the final result is narrowed to 16 bits with + * saturation. * * @param[in] num_mats The number of input matrices. * @param[in] num_vecs_per_mat The number of input and output vectors @@ -839,10 +839,10 @@ armral_status armral_cmplx_mat_vec_mult_batch_i16_32bit( * * representing an identical format to the input matrices. * - * A 32-bit Q0.31 saturating accumulator is used internally. If you need a - * larger range, consider using \link armral_cmplx_mat_vec_mult_batch_i16_pa - * \endlink instead. To get a Q15 result, the final result is narrowed to 16 - * bits with saturation. + * A 32-bit Q31 saturating accumulator is used internally. If you need a larger + * range, consider using \link armral_cmplx_mat_vec_mult_batch_i16_pa \endlink + * instead. To get a Q15 result, the final result is narrowed to 16 bits with + * saturation. * * @param[in] num_mats The number of input matrices. * @param[in] num_vecs_per_mat The number of input and output vectors @@ -1054,10 +1054,9 @@ armral_cmplx_matmul_i16_noalloc(uint16_t m, uint16_t n, uint16_t k, * + Matrix elements are complex int16 in Q15 format. 
* + Matrices are stored in memory in row-major order. * - * A 32-bit Q0.31 saturating accumulator is used internally. If you need a - * larger range, consider using \link armral_cmplx_matmul_i16 \endlink - * instead. To get a Q15 result, the final result is narrowed to 16 bits with - * saturation. + * A 32-bit Q31 saturating accumulator is used internally. If you need a larger + * range, consider using \link armral_cmplx_matmul_i16 \endlink instead. To get + * a Q15 result, the final result is narrowed to 16 bits with saturation. * * @param[in] m The number of rows (`M`) in matrices `A` and `C`. * @param[in] n The number of columns (`N`) in matrices `B` and @@ -1411,7 +1410,7 @@ armral_solve_2x4_f32(uint32_t num_sub_carrier, uint32_t num_sc_per_g, * number of antennae and `x̂` is the estimate of the transmitted signal, size * corresponds to the number of layers. * - * The input values for y are given in the Q0.15 fixed-point format. Each + * The input values for y are given in the Q15 fixed-point format. Each * component of the vector may have a different number of fractional bits. The * number of fractional bits per `y` component is passed in an array of the * same length as `y`. @@ -2832,8 +2831,7 @@ armral_status armral_polar_frozen_mask(uint32_t n, uint32_t e, uint32_t k, * of the 3GPP Technical Specification (TS) 38.212, the function interleaves the * supplied input bit array `c` into a larger output bit array. `c` interleaves * into positions where the `frozen` mask indicates an information bit is - * present. Interleaving is performed as specified in section 5.3.1.1 of 3GPP TS - * 38.212. + * present. * * For a particular underlying Polar code of length `N` bits (`N` must be a * power of two between 32 and 1024 inclusive), the `frozen` mask must be an @@ -3180,10 +3178,17 @@ uint32_t armral_polar_crc_check_noalloc_buffer_size(uint32_t k); * \brief Computes the Discrete Fourier Transform (DFT) of a sequence (forwards * transform), or the inverse (backwards transform). * - * FFT plans are represented by an opaque structure. To fill the plan - * structure, define a pointer to the structure and call \link - * armral_fft_create_plan_cf32 \endlink or \link armral_fft_create_plan_cs16 - * \endlink. For example: + * FFT plans are represented by an opaque structure. To fill the plan structure, + * define a pointer to the structure and call one of + * + * - armral_fft_create_plan_T + * - armral_fft_create_2d_plan_T + * + * where T is the data type of the input and output arrays. T can be one of + * cf32 or cs16. cf32 means complex single precision floating point. cs16 means + * complex fixed point in Q15 format. + * + * For example: *
  * armral_fft_plan_t *plan;
  * armral_fft_create_plan_cf32(&plan, 32, ARMRAL_FFT_FORWARDS);
@@ -3191,17 +3196,15 @@ uint32_t armral_polar_crc_check_noalloc_buffer_size(uint32_t k);
  * armral_fft_destroy_plan_cf32(&plan);
 * </pre>
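+ *
+ * The same execute and destroy calls also cover the 2-D case: a 2-D plan is
+ * created with one of the armral_fft_create_2d_plan_T functions and is then
+ * executed and destroyed exactly as above. The sketch below is illustrative
+ * only; the 4-by-5 problem size is arbitrary and `x` and `y` are assumed to
+ * be arrays of `n0 * n1` complex elements.
+ * <pre>
+ * armral_fft_plan_t *plan_2d;
+ * armral_fft_create_2d_plan_cf32(&plan_2d, 4, 5, ARMRAL_FFT_FORWARDS);
+ * armral_fft_execute_cf32(plan_2d, x, y);
+ * armral_fft_destroy_plan_cf32(&plan_2d);
+ * </pre>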
*/ + /** * The opaque structure to an FFT plan. You must fill an FFT plan before you - * use it. To fill an FFT plan, call \link armral_fft_create_plan_cf32 \endlink - * or \link armral_fft_create_plan_cs16 \endlink. + * use it. */ typedef struct armral_fft_plan_t armral_fft_plan_t; /** - * The direction of the FFT being computed. The direction is passed to \link - * armral_fft_create_plan_cf32 \endlink and \link armral_fft_create_plan_cs16 - * \endlink. + * The direction of the FFT being computed. */ typedef enum { ARMRAL_FFT_FORWARDS = -1, ///< Compute a forwards (non-inverse) FFT. @@ -3209,7 +3212,7 @@ typedef enum { } armral_fft_direction_t; /** - * @brief Creates a plan to solve a complex fp32 FFT. + * @brief Creates a 1-dimensional fp32 FFT plan for complex data. * * Fills the passed pointer with a pointer to the plan that is created. The plan * that is created can then be used to solve problems with specified size and @@ -3222,31 +3225,58 @@ typedef enum { * * \note This function supports input sizes `n <= 42012`. * - * @param[in,out] p A pointer to the resulting plan pointer. On output `*p` is - * a valid pointer, to be passed to - * \link armral_fft_execute_cf32 \endlink. - * @param[in] n The problem size to be solved by this FFT plan. - * @param[out] dir The direction to be solved by this FFT plan. + * @param[out] p A pointer to the resulting plan pointer. On output `*p` is a + * valid pointer, to be passed to \link armral_fft_execute_cf32 + * \endlink. + * @param[in] n The problem size to be solved by this FFT plan. + * @param[in] dir The direction to be solved by this FFT plan. Either + * `ARMRAL_FFT_FORWARDS` or `ARMRAL_FFT_BACKWARDS`. * @return An `armral_status` value that indicates success or failure */ armral_status armral_fft_create_plan_cf32(armral_fft_plan_t **p, int n, armral_fft_direction_t dir); /** - * @brief Performs a single FFT using the specified plan and arrays. + * @brief Creates a 2-dimensional fp32 FFT plan for complex data. * - * Uses the plan created by \link armral_fft_create_plan_cf32 \endlink to - * perform the configured FFT with the arrays that are specified. + * Fills the passed pointer with a pointer to the plan that is created. The + * plan can then be used to solve 2-d problems with specified dimensions and + * direction. It is efficient to create plans once and reuse them, rather than + * creating a plan for every execute call. For some inputs, creating FFT plans + * can incur a significant overhead. * - * @param[in] p A pointer to the FFT plan. The pointer is the value that is - * filled in by an earlier call to - * \link armral_fft_create_plan_cf32 \endlink. - * @param[in] x The input array for this FFT. The length must be the same as - * the value of `n` that was previously passed to - * \link armral_fft_create_plan_cf32 \endlink. - * @param[out] y The output array for this FFT. The length must be the same as - * the value of `n` that was previously passed to - * \link armral_fft_create_plan_cf32 \endlink. + * The input and output arrays passed to \ref armral_fft_execute_cf32 should be + * in row-major format, i.e. the dimension of length n1 is contiguous, and the + * dimension of length n0 has a stride between successive elements of n1. + * + * To avoid memory leaks, call \link armral_fft_destroy_plan_cf32 \endlink when + * the plan is no longer needed. + * + * @param[out] p A pointer to the resulting plan pointer. On output `*p` is a + * valid pointer, to be passed to \link armral_fft_execute_cf32 + * \endlink. 
+ * @param[in] n0 The size of the first dimension. + * @param[in] n1 The size of the second dimension. + * @param[in] dir The direction to be solved by this FFT plan. Either + * `ARMRAL_FFT_FORWARDS` or `ARMRAL_FFT_BACKWARDS`. + * @return An `armral_status` value that indicates success or failure + */ +armral_status armral_fft_create_2d_plan_cf32(armral_fft_plan_t **p, int n0, + int n1, + armral_fft_direction_t dir); + +/** + * @brief Performs a single FFT using the specified plan and arrays. + * + * Uses the plan created by \ref armral_fft_create_plan_cf32 or \ref + * armral_fft_create_2d_plan_cf32 to perform the configured FFT with the arrays + * that are specified. + * + * @param[in] p A pointer to the FFT plan. + * @param[in] x The input array for this FFT. The array must contain `n` + * elements for 1-d plans, and `n0 * n1` for 2-d plans. + * @param[out] y The output array for this FFT. The array must contain `n` + * elements for 1-d plans, and `n0 * n1` for 2-d plans. * @return An `armral_status` value that indicates success or failure. */ armral_status armral_fft_execute_cf32(const armral_fft_plan_t *p, @@ -3254,22 +3284,20 @@ armral_status armral_fft_execute_cf32(const armral_fft_plan_t *p, armral_cmplx_f32_t *y); /** - * @brief Destroys an FFT plan. + * @brief Destroys an FFT plan. * * The \link armral_fft_execute_cf32 \endlink function frees any associated * memory, and sets `*p = NULL`, for a plan that was previously created by - * \link armral_fft_create_plan_cf32 \endlink. + * \ref armral_fft_create_plan_cf32 or \ref armral_fft_create_2d_plan_cf32. * - * @param[in,out] p A pointer to the FFT plan pointer. The pointer must be the - * value that is returned by an earlier call to - * \link armral_fft_create_plan_cf32 \endlink. On function - * exit, the value that is pointed to is set to `NULL`. + * @param[in,out] p A pointer to the FFT plan pointer. On function exit, `*p` + * is set to `NULL`. * @return An `armral_status` value that indicates success or failure. */ armral_status armral_fft_destroy_plan_cf32(armral_fft_plan_t **p); /** - * @brief Creates a plan to solve a complex int16 (Q0.15 format) FFT. + * @brief Creates a 1-dimensional int16 (Q15 format) FFT plan for complex data. * * Fills the passed pointer with a pointer to the plan that is created. The plan * that is created can then be used to solve problems with specified size and @@ -3282,31 +3310,58 @@ armral_status armral_fft_destroy_plan_cf32(armral_fft_plan_t **p); * * \note This function supports input sizes `n <= 42012`. * - * @param[in,out] p A pointer to the resulting plan pointer. On output `*p` is - * a valid pointer, to be passed to - * \link armral_fft_execute_cs16 \endlink. - * @param[in] n The problem size to be solved by this FFT plan. - * @param[out] dir The direction to be solved by this FFT plan. + * @param[out] p A pointer to the resulting plan pointer. On output `*p` is a + * valid pointer, to be passed to \link armral_fft_execute_cs16 + * \endlink. + * @param[in] n The problem size to be solved by this FFT plan. + * @param[in] dir The direction to be solved by this FFT plan. Either + * `ARMRAL_FFT_FORWARDS` or `ARMRAL_FFT_BACKWARDS`. * @return An `armral_status` value that indicates success or failure. */ armral_status armral_fft_create_plan_cs16(armral_fft_plan_t **p, int n, armral_fft_direction_t dir); /** - * @brief Performs a single FFT using the specified plan and arrays. + * @brief Creates a 2-dimensional int16 (Q15 format) FFT plan for complex data. 
+ * + * Fills the passed pointer with a pointer to the plan that is created. The + * plan can then be used to solve 2-d problems with specified dimensions and + * direction. It is efficient to create plans once and reuse them, rather than + * creating a plan for every execute call. For some inputs, creating FFT plans + * can incur a significant overhead. + * + * The input and output arrays passed to \ref armral_fft_execute_cs16 should be + * in row-major format, i.e. the dimension of length n1 is contiguous, and the + * dimension of length n0 has a stride between successive elements of n1. + * + * To avoid memory leaks, call \link armral_fft_destroy_plan_cs16 \endlink when + * you no longer need this plan. + * + * @param[out] p A pointer to the resulting plan pointer. On output `*p` is a + * valid pointer, to be passed to \link armral_fft_execute_cf32 + * \endlink. + * @param[in] n0 The size of the first dimension. + * @param[in] n1 The size of the second dimension. + * @param[in] dir The direction to be solved by this FFT plan. Either + * `ARMRAL_FFT_FORWARDS` or `ARMRAL_FFT_BACKWARDS`. + * @return An `armral_status` value that indicates success or failure + */ +armral_status armral_fft_create_2d_plan_cs16(armral_fft_plan_t **p, int n0, + int n1, + armral_fft_direction_t dir); + +/** + * @brief Performs a single FFT using the specified plan and arrays. * - * Uses the plan created by \link armral_fft_create_plan_cs16 \endlink to - * perform the configured FFT with the arrays that are specified. + * Uses the plan created by \ref armral_fft_create_plan_cs16 or \ref + * armral_fft_create_2d_plan_cs16 to perform the configured FFT with the arrays + * that are specified. * - * @param[in] p A pointer to the FFT plan. The pointer is the value that is - * filled in by an earlier call to - * \link armral_fft_create_plan_cs16 \endlink. - * @param[in] x The input array for this FFT. The length must be the same as - * the value of `n` that was previously passed to - * \link armral_fft_create_plan_cs16 \endlink. - * @param[out] y The output array for this FFT. The length must be the same as - * the value of `n` that was previously passed to - * \link armral_fft_create_plan_cs16 \endlink. + * @param[in] p A pointer to the FFT plan. + * @param[in] x The input array for this FFT. The array must contain `n` + * elements for 1-d plans, and `n0 * n1` for 2-d plans. + * @param[out] y The output array for this FFT. The array must contain `n` + * elements for 1-d plans, and `n0 * n1` for 2-d plans. * @return An `armral_status` value that indicates success or failure. */ armral_status armral_fft_execute_cs16(const armral_fft_plan_t *p, @@ -3314,16 +3369,14 @@ armral_status armral_fft_execute_cs16(const armral_fft_plan_t *p, armral_cmplx_int16_t *y); /** - * @brief Destroys an FFT plan. + * @brief Destroys an FFT plan. * * The \link armral_fft_execute_cs16 \endlink function frees any associated - * memory, and sets `*p = NULL`, for a plan that was previously created by - * \link armral_fft_create_plan_cs16 \endlink. + * memory, and sets `*p = NULL`, for a plan that was previously created by \ref + * armral_fft_create_plan_cs16 or \ref armral_fft_create_2d_plan_cs16. * - * @param[in,out] p A pointer to the FFT plan pointer. The pointer must be the - * value that is returned by an earlier call to - * \link armral_fft_create_plan_cs16 \endlink. On function - * exit, the value that is pointed to is set to `NULL`. + * @param[in,out] p A pointer to the FFT plan pointer. On function exit, `*p` + * is set to `NULL`. 
* @return An `armral_status` value that indicates success or failure. */ armral_status armral_fft_destroy_plan_cs16(armral_fft_plan_t **p); @@ -3403,56 +3456,109 @@ typedef struct { } armral_ldpc_base_graph_t; /** - * @enum armral_ldpc_decode_options_t - * @brief A constant which can be passed to `armral_ldpc_decode_block` relevant - * to the CRC operation type. + * Use the default options when doing LDPC decoding. * - * This enumeration defines various options for LDPC decoding, - * including different CRC calculation methods, iteration-based CRC checks, and - * filler bit handling. + * Option used by \ref armral_ldpc_decode_block. * - * Enumeration values: - * - `ARMRAL_LDPC_CRC_NO`: No CRC calculation. - * - `ARMRAL_LDPC_CRC_16`: CRC-16 checksum. - * - `ARMRAL_LDPC_CRC_24A`: CRC-24A checksum. - * - `ARMRAL_LDPC_CRC_24B`: CRC-24B checksum (default). - * - `ARMRAL_LDPC_CRC_EVERY_ITER`: Perform CRC check at every decoding iteration - * (default). - * - `ARMRAL_LDPC_CRC_END_ITER`: Perform CRC check only at the end of all - * decoding iterations. - * - `ARMRAL_LDPC_FILLER_BITS_IMPLICIT`: Implicit handling of filler bits. - * - `ARMRAL_LDPC_FILLER_BITS_EXPLICIT`: Explicit handling of filler bits - * (default). + * Implies \ref ARMRAL_LDPC_CRC_NO and \ref ARMRAL_LDPC_FILLER_BITS_EXPLICIT. */ -typedef enum { - ARMRAL_LDPC_CRC_NO = 0, ///< No CRC calculation - ARMRAL_LDPC_CRC_16, ///< CRC-16 - ARMRAL_LDPC_CRC_24A, ///< CRC-24A - ARMRAL_LDPC_CRC_24B, ///< CRC-24B (default) +#define ARMRAL_LDPC_DEFAULT_OPTIONS 0 + +/** + * The LDPC decoder result should not be checked. + * + * Option used by \ref armral_ldpc_decode_block. + * + * If set, no CRC check is executed and, \ref ARMRAL_LDPC_CRC_EVERY_ITER and + * \ref ARMRAL_LDPC_CRC_END_ITER are ignored. + * + * `ARMRAL_LDPC_CRC_NO` is set by default. + */ +#define ARMRAL_LDPC_CRC_NO (1 << 1) + +/** + * Check for convergence of LDPC decoding using CRC16. + * + * Option used by \ref armral_ldpc_decode_block. + * + * Not currently supported. + */ +#define ARMRAL_LDPC_CRC_16 (1 << 2) + +/** + * Check for convergence of LDPC decoding using CRC24_A. + * + * Option used by \ref armral_ldpc_decode_block. + * + * Not currently supported. + */ +#define ARMRAL_LDPC_CRC_24A (1 << 3) + +/** + * Check for convergence of LDPC decoding using CRC24_B. + * + * Option used by \ref armral_ldpc_decode_block. + * + * If set, a CRC check will be performed during the decoding. The frequency of + * how often the CRC check is done is determined by \ref + * ARMRAL_LDPC_CRC_EVERY_ITER and \ref ARMRAL_LDPC_CRC_END_ITER. If the CRC + * check succeeds, \ref armral_ldpc_decode_block will return `ARMRAL_SUCCESS`. + * If the CRC check fails on the last iteration, \ref armral_ldpc_decode_block + * will return `ARMRAL_FAIL`. + */ +#define ARMRAL_LDPC_CRC_24B (1 << 4) - ARMRAL_LDPC_CRC_EVERY_ITER = (1 << 2), ///< CRC every iteration (default) - ARMRAL_LDPC_CRC_END_ITER = (2 << 2), ///< CRC at end of iterations +/** + * Check for convergence on every iteration of LDPC decoding. + * + * Option used by \ref armral_ldpc_decode_block. + * + * If set, and \ref ARMRAL_LDPC_CRC_NO is not set. A CRC check will be + * run on every iteration. If the CRC check succeeds, + * \ref armral_ldpc_decode_block will return with `ARMRAL_SUCCESS`. If the + * max number of iterations is exceeded without a successful CRC check, + * `armral_ldpc_decode_block` will return `ARMRAL_FAIL`. + * + * `ARMRAL_LDPC_CRC_EVERY_ITER` is set by default. 
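+ *
+ * As an illustrative example, passing
+ * `ARMRAL_LDPC_CRC_24B | ARMRAL_LDPC_CRC_EVERY_ITER` in `options` requests a
+ * CRC-24B convergence check on every decoding iteration.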
+ */
+#define ARMRAL_LDPC_CRC_EVERY_ITER (1 << 5)
 
-  ARMRAL_LDPC_FILLER_BITS_IMPLICIT = (1 << 4), ///< Filler bits implicit
-  ///< Filler bits explicit (default)
-  ARMRAL_LDPC_FILLER_BITS_EXPLICIT = (2 << 4)
-} armral_ldpc_decode_options_t;
+/**
+ * Check for convergence only on the last iteration of LDPC decoding.
+ *
+ * Option used by \ref armral_ldpc_decode_block.
+ *
+ * If set, and \ref ARMRAL_LDPC_CRC_NO is not set, a CRC check will be run
+ * after `armral_ldpc_decode_block` has reached the maximum number of
+ * iterations. If the CRC check succeeds, `armral_ldpc_decode_block` returns
+ * `ARMRAL_SUCCESS`. If the CRC check fails, it returns `ARMRAL_FAIL`.
+ */
+#define ARMRAL_LDPC_CRC_END_ITER (1 << 6)
 
 /**
- * @brief Default options for ARMRAL LDPC processing.
+ * Assumes the LDPC decoding input has implicit filler bits.
  *
- * This macro defines the default configuration options for ARMRAL LDPC,
- * combining the following flags:
- * - `ARMRAL_LDPC_CRC_24B`: Enables 24-bit CRC checking.
- * - `ARMRAL_LDPC_CRC_EVERY_ITER`: Performs CRC verification at every decoding
- *   iteration.
- * - `ARMRAL_LDPC_FILLER_BITS_EXPLICIT`: Explicitly marks filler bits in
- *   decoding.
+ * Option used by \ref armral_ldpc_decode_block.
  *
+ * If set, `len_filler_bits` LLRs corresponding to the filler bits
+ * will be inserted between the LLRs for the message bits and the
+ * parity bits before attempting to decode.
  */
-#define ARMRAL_LDPC_DEFAULT_OPTIONS \
-  (ARMRAL_LDPC_CRC_24B | ARMRAL_LDPC_CRC_EVERY_ITER | \
-   ARMRAL_LDPC_FILLER_BITS_EXPLICIT)
+#define ARMRAL_LDPC_FILLER_BITS_IMPLICIT (1 << 7)
+
+/**
+ * Assumes the LDPC decoding input has explicit filler bits.
+ *
+ * Option used by \ref armral_ldpc_decode_block.
+ *
+ * If set, it is assumed that `len_filler_bits` LLRs corresponding to the
+ * filler bits have been inserted between the LLRs for the message bits and the
+ * parity bits. This is the format of the output LLRs from \ref
+ * armral_ldpc_rate_recovery.
+ *
+ * `ARMRAL_LDPC_FILLER_BITS_EXPLICIT` is set by default.
+ */
+#define ARMRAL_LDPC_FILLER_BITS_EXPLICIT (1 << 8)
 
 /**
  * Uses the identifier of a base graph to get the data structure that describes
@@ -3601,17 +3707,58 @@ uint32_t armral_ldpc_encode_block_noalloc_buffer_size(armral_ldpc_graph_t bg,
  * 1, and 42 layers in base graph 2. Decoding is performed for a single code
  * block.
  *
- * There is the option to use CRC checking as a stopping criteria for the
- * iterative decoding. For code blocks with CRC bits attached, the input
- * `crc_idx` should be set to the index of the bit where the CRC bits begin, as
- * calculated according to section 5.2.2 of the 3GPP Technical Specification
- * (TS) 38.212. It is possible that there is no CRC data attached to the code
- * block, in which case `ARMRAL_LDPC_NO_CRC` can be passed.
+ * The `options` parameter can be either \ref ARMRAL_LDPC_DEFAULT_OPTIONS, or a
+ * bitwise OR of the fields below:
+ *
+ * CRC Type:
+ * - \ref ARMRAL_LDPC_CRC_NO (default)
+ * - \ref ARMRAL_LDPC_CRC_16 (not implemented)
+ * - \ref ARMRAL_LDPC_CRC_24A (not implemented)
+ * - \ref ARMRAL_LDPC_CRC_24B
+ *
+ * CRC Mode:
+ * - \ref ARMRAL_LDPC_CRC_EVERY_ITER (default)
+ * - \ref ARMRAL_LDPC_CRC_END_ITER
+ *
+ * Filler Bits:
+ * - \ref ARMRAL_LDPC_FILLER_BITS_IMPLICIT
+ * - \ref ARMRAL_LDPC_FILLER_BITS_EXPLICIT (default)
+ *
+ * The CRC Type, CRC Mode, and Filler Bits options are mutually exclusive
+ * within each group. If more than one option from the same group is set,
+ * `armral_ldpc_decode_block` returns `ARMRAL_ARGUMENT_ERROR`.
+ *
+ * If \ref ARMRAL_LDPC_CRC_NO (default) is set in `options`, no CRC check is
+ * performed. If \ref ARMRAL_LDPC_CRC_24B is set in `options`, a CRC check will
+ * be performed using \ref armral_crc24_b_be. How often the CRC check is run
+ * is determined by \ref ARMRAL_LDPC_CRC_EVERY_ITER (default) and \ref
+ * ARMRAL_LDPC_CRC_END_ITER. If the CRC check succeeds,
+ * `armral_ldpc_decode_block` returns `ARMRAL_SUCCESS`. If the CRC check fails
+ * on the last iteration, `armral_ldpc_decode_block` returns `ARMRAL_FAIL`.
+ *
+ * The LDPC decoder supports two input formats: explicit filler
+ * bits or implicit filler bits. If \ref
+ * ARMRAL_LDPC_FILLER_BITS_EXPLICIT (default) is set in `options`, the
+ * input LLRs must be in the format
+ *
+ *
+ *   [ message LLRs | filler LLRs | parity LLRs ]
+ * 
+ * + * If \ref ARMRAL_LDPC_FILLER_BITS_IMPLICIT is set in `options`, the + * input LLRs must be in the format + * + *
+ *   [ message LLRs | parity LLRs ]
+ * 
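+ * As a usage sketch only (the variables shown are assumed to be prepared as
+ * described in the parameter list below), a decode that checks a CRC24_B on
+ * every iteration and expects implicit filler bits combines the options as:
+ *
+ * @code
+ * uint32_t options = ARMRAL_LDPC_CRC_24B | ARMRAL_LDPC_CRC_EVERY_ITER |
+ *                    ARMRAL_LDPC_FILLER_BITS_IMPLICIT;
+ * armral_status status = armral_ldpc_decode_block(
+ *     n, llrs, bg, z, len_filler_bits, data_out, max_its, options);
+ * @endcode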
+ * + * `ARMRAL_LDPC_FILLER_BITS_EXPLICIT` should be used if `llrs` was generated by + * \ref armral_ldpc_rate_recovery. * * @param[in] n The length of `llrs`. * @param[in] llrs The initial LLRs to use in the decoding. This is * typically the output after demodulation and rate - * recovery. Supports 8 bit llrs in q1.7. + * recovery. * @param[in] bg The type of base graph to use for the decoding. * @param[in] z The lifting size. Valid values of the lifting size are * described in table 5.3.2-1 in TS 38.212. @@ -3620,65 +3767,94 @@ uint32_t armral_ldpc_encode_block_noalloc_buffer_size(armral_ldpc_graph_t bg, * ensure that the code block segments have a valid * length and are a multiple of the lifting size. * Filler bits are used to calculate CRC internally. - * This is assumed to be a multiple of 8bits. + * This is assumed to be a multiple of 8. * @param[out] data_out The decoded bits. These are of length `22 * z` for base * graph 1 and `10 * z` for base graph 2. It is assumed * that the array `data_out` is able to store this many * bits. - * @param[in] max_its The maximum number of iterations of the LDPC decoder to - * run. The algorithm may terminate after fewer iterations - * if the current candidate codeword passes all the parity + * @param[in] max_its The maximum number of iterations of the LDPC decoder. + * The algorithm may terminate after fewer iterations if + * the current candidate codeword passes all the parity * checks, or if it satisfies the CRC check. - * @param[in] options It is an OR'd result of the below fields, - * CRC Type: - * ARMRAL_LDPC_CRC_NO - * ARMRAL_LDPC_CRC_16 - * ARMRAL_LDPC_CRC_24A - * ARMRAL_LDPC_CRC_24B (default) - * CRC Mode: - * ARMRAL_LDPC_CRC_EVERY_ITER (default) - * ARMRAL_LDPC_CRC_END_ITER - * Filler Bits: - * ARMRAL_LDPC_FILLER_BITS_IMPLICIT - * ARMRAL_LDPC_FILLER_BITS_EXPLICIT (default). - * @return An `armral_status` value that indicates success or failure. + * @param[in] options See the documentation above for a summary of available + * `options`. If you want to use the default options, set + * the options parameter to either `0` or + * `ARMRAL_LDPC_DEFAULT_OPTIONS`. + * @return Returns `ARMRAL_SUCCESS` on success, `ARMRAL_ARGUMENT_ERROR` if + * an input parameter is incorrect, or `ARMRAL_FAIL` if the CRC + * check for convergence fails. */ armral_status armral_ldpc_decode_block(uint32_t n, const int8_t *llrs, armral_ldpc_graph_t bg, uint32_t z, uint32_t len_filler_bits, uint8_t *data_out, uint32_t max_its, - armral_ldpc_decode_options_t options); + uint32_t options); /** * Non-allocating variant of \link armral_ldpc_decode_block \endlink. * - * This function performs decoding of LDPC using a layered min-sum algorithm. - * This is an iterative algorithm which takes 8-bit log-likelihood ratios (LLRs) - * and calculates the most likely codeword by calculating updates using - * information available from the parity checks in the LDPC graph. LLRs are - * updated after evaluating checks in a 'layer', where a layer is assumed to - * contain the same number of checks as the lifting size `z`. There are 46 - * layers in base graph 1, and 42 layers in base graph 2. Decoding is performed - * for a single code block. + * Performs decoding of LDPC using a layered min-sum algorithm. This is an + * iterative algorithm which takes 8-bit log-likelihood ratios (LLRs) and + * calculates the most likely codeword by calculating updates using information + * available from the parity checks in the LDPC graph. 
LLRs are updated after
+ * evaluating checks in a 'layer', where a layer is assumed to contain the same
+ * number of checks as the lifting size `z`. There are 46 layers in base graph
+ * 1, and 42 layers in base graph 2. Decoding is performed for a single code
+ * block.
  *
- * There is the option to use CRC checking as a stopping criteria for the
- * iterative decoding. For code blocks with CRC bits attached, the input
- * `crc_idx` should be set to the index of the bit where the CRC bits begin, as
- * calculated according to section 5.2.2 of the 3GPP Technical Specification
- * (TS) 38.212. It is possible that there is no CRC data attached to the code
- * block, in which case `ARMRAL_LDPC_NO_CRC` can be passed.
  *
- * This function takes a pre-allocated buffer (`buffer`) to use internally.
- * This variant will not call any system memory allocators.
+ * The `options` parameter can be either \ref ARMRAL_LDPC_DEFAULT_OPTIONS, or a
+ * bitwise OR of the fields below:
  *
- * The buffer must be at least as large as the number of bytes returned by
- * calling \link armral_ldpc_decode_block_noalloc_buffer_size \endlink
- * with identical inputs.
+ * CRC Type:
+ * - \ref ARMRAL_LDPC_CRC_NO (default)
+ * - \ref ARMRAL_LDPC_CRC_16 (not implemented)
+ * - \ref ARMRAL_LDPC_CRC_24A (not implemented)
+ * - \ref ARMRAL_LDPC_CRC_24B
+ *
+ * CRC Mode:
+ * - \ref ARMRAL_LDPC_CRC_EVERY_ITER (default)
+ * - \ref ARMRAL_LDPC_CRC_END_ITER
+ *
+ * Filler Bits:
+ * - \ref ARMRAL_LDPC_FILLER_BITS_IMPLICIT
+ * - \ref ARMRAL_LDPC_FILLER_BITS_EXPLICIT (default)
+ *
+ * The CRC Type, CRC Mode, and Filler Bits options are mutually exclusive
+ * within each group. If more than one option from the same group is set,
+ * `armral_ldpc_decode_block` returns `ARMRAL_ARGUMENT_ERROR`.
+ *
+ * If \ref ARMRAL_LDPC_CRC_NO (default) is set in `options`, no CRC check is
+ * performed. If \ref ARMRAL_LDPC_CRC_24B is set in `options`, a CRC check will
+ * be performed using \ref armral_crc24_b_be. How often the CRC check is run
+ * is determined by \ref ARMRAL_LDPC_CRC_EVERY_ITER (default) and \ref
+ * ARMRAL_LDPC_CRC_END_ITER. If the CRC check succeeds,
+ * `armral_ldpc_decode_block` returns `ARMRAL_SUCCESS`. If the CRC check fails
+ * on the last iteration, `armral_ldpc_decode_block` returns `ARMRAL_FAIL`.
+ *
+ * The LDPC decoder supports two input formats: explicit filler
+ * bits or implicit filler bits. If \ref
+ * ARMRAL_LDPC_FILLER_BITS_EXPLICIT (default) is set in `options`, the
+ * input LLRs must be in the format
+ *
+ *
+ *   [ message LLRs | filler LLRs | parity LLRs ]
+ * 
+ * + * If \ref ARMRAL_LDPC_FILLER_BITS_IMPLICIT is set in `options`, the + * input LLRs must be in the format + * + *
+ *   [ message LLRs | parity LLRs ]
+ * 
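+ * A minimal sketch of the non-allocating call sequence, assuming the
+ * workspace is sized with \ref armral_ldpc_decode_block_noalloc_buffer_size
+ * (documented below) and that the other arguments are prepared as for
+ * \ref armral_ldpc_decode_block:
+ *
+ * @code
+ * uint32_t buffer_size =
+ *     armral_ldpc_decode_block_noalloc_buffer_size(bg, z, max_its);
+ * void *buffer = malloc(buffer_size); // illustrative; any workspace of at
+ *                                     // least buffer_size bytes can be used
+ * armral_status status = armral_ldpc_decode_block_noalloc(
+ *     n, llrs, bg, z, len_filler_bits, data_out, max_its,
+ *     ARMRAL_LDPC_DEFAULT_OPTIONS, buffer);
+ * free(buffer);
+ * @endcode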
+ * + * `ARMRAL_LDPC_FILLER_BITS_EXPLICIT` should be used if `llrs` was generated by + * \ref armral_ldpc_rate_recovery. * * @param[in] n The length of `llrs`. * @param[in] llrs The initial LLRs to use in the decoding. This is * typically the output after demodulation and rate - * recovery. Supports 8 bit llrs in q1.7. + * recovery. * @param[in] bg The type of base graph to use for the decoding. * @param[in] z The lifting size. Valid values of the lifting size are * described in table 5.3.2-1 in TS 38.212. @@ -3687,34 +3863,28 @@ armral_status armral_ldpc_decode_block(uint32_t n, const int8_t *llrs, * ensure that the code block segments have a valid * length and are a multiple of the lifting size. * Filler bits are used to calculate CRC internally. - * This is assumed to be a multiple of 8bits. + * This is assumed to be a multiple of 8. * @param[out] data_out The decoded bits. These are of length `22 * z` for base * graph 1 and `10 * z` for base graph 2. It is assumed * that the array `data_out` is able to store this many * bits. - * @param[in] max_its The maximum number of iterations of the LDPC decoder to - * run. The algorithm may terminate after fewer iterations - * if the current candidate codeword passes all the parity + * @param[in] max_its The maximum number of iterations of the LDPC decoder. + * The algorithm may terminate after fewer iterations if + * the current candidate codeword passes all the parity * checks, or if it satisfies the CRC check. - * @param[in] options It is an OR'd result of the below fields, - * CRC Type: - * ARMRAL_LDPC_CRC_NO - * ARMRAL_LDPC_CRC_16 - * ARMRAL_LDPC_CRC_24A - * ARMRAL_LDPC_CRC_24B (default) - * CRC Mode: - * ARMRAL_LDPC_CRC_EVERY_ITER (default) - * ARMRAL_LDPC_CRC_END_ITER - * Filler Bits: - * ARMRAL_LDPC_FILLER_BITS_IMPLICIT - * ARMRAL_LDPC_FILLER_BITS_EXPLICIT (default). + * @param[in] options See the documentation above for a summary of available + * `options`. If you want to use the default options, set + * the options parameter to either `0` or + * `ARMRAL_LDPC_DEFAULT_OPTIONS`. * @param[in] buffer Workspace buffer to be used internally. - * @return An `armral_status` value that indicates success or failure. + * @return Returns `ARMRAL_SUCCESS` on success, `ARMRAL_ARGUMENT_ERROR` if + * an input parameter is incorrect, or `ARMRAL_FAIL` if the CRC + * check for convergence fails. */ armral_status armral_ldpc_decode_block_noalloc( uint32_t n, const int8_t *llrs, armral_ldpc_graph_t bg, uint32_t z, uint32_t len_filler_bits, uint8_t *data_out, uint32_t max_its, - armral_ldpc_decode_options_t options, void *buffer); + uint32_t options, void *buffer); /** * Calculates the required buffer size in bytes needed to perform LDPC decoding @@ -3723,9 +3893,9 @@ armral_status armral_ldpc_decode_block_noalloc( * @param[in] bg The type of base graph to use for the decoding. * @param[in] z The lifting size. Valid values of the lifting size are * described in table 5.3.2-1 in TS 38.212. - * @param[in] max_its The maximum number of iterations of the LDPC decoder to - * run. The algorithm may terminate after fewer iterations - * if the current candidate codeword passes all the parity + * @param[in] max_its The maximum number of iterations of the LDPC decoder. + * The algorithm may terminate after fewer iterations if + * the current candidate codeword passes all the parity * checks, or if it satisfies the CRC check. * @return The required buffer size in bytes. 
*/ diff --git a/simulation/CMakeLists.txt b/simulation/CMakeLists.txt index 7c610ebfff05f7cd2c5d071fd8f2dfd31264318e..c9aedfe9d9edb2d31bac6d80fe59239513ee22fe 100644 --- a/simulation/CMakeLists.txt +++ b/simulation/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.3) +cmake_minimum_required(VERSION 3.10) project(simulation_all VERSION 0.0) include(CheckCCompilerFlag) @@ -13,6 +13,10 @@ add_library(simulation_common INTERFACE) target_include_directories(simulation_common INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/include) +set(ARMRAL_SIMULTATION_TIMEOUT + 3000 + CACHE STRING "Configure timeout for simulation (seconds)") + find_package(OpenMP) find_package(Threads) if(Threads_FOUND) @@ -67,9 +71,8 @@ if(Threads_FOUND) target_link_libraries(${SIM_NAME} PUBLIC simulation_common armral armral_awgn armral_utils) target_link_libraries(${SIM_NAME} PRIVATE OpenMP::OpenMP_CXX) - target_compile_options( - ${SIM_NAME} PRIVATE ${SIM_COMPILE_OPTIONS} ${SIM_COMPILER_FLAGS} - "$<$:-Og>") + target_compile_options(${SIM_NAME} PRIVATE ${SIM_COMPILE_OPTIONS} + ${SIM_COMPILER_FLAGS}) add_dependencies(simulation ${SIM_NAME}) @@ -80,7 +83,9 @@ if(Threads_FOUND) # are not using a test running wrapper. add_test(NAME ${SIM_NAME} COMMAND ${CMAKE_CURRENT_BINARY_DIR}/${SIM_NAME} ${SIM_CMD_LINE_OPTS}) - set_tests_properties(${SIM_NAME} PROPERTIES TIMEOUT 3000) + set_tests_properties(${SIM_NAME} PROPERTIES TIMEOUT + ${ARMRAL_SIMULTATION_TIMEOUT}) + if(ARMRAL_ENABLE_ASAN) # Avoid slow-downs in newer versions of Address Santizier # https://github.com/llvm/llvm-project/issues/64190 @@ -94,7 +99,7 @@ if(Threads_FOUND) endfunction() add_armral_sim(convolutional_awgn "-k;8;-m;0;-u;128") - add_armral_sim(ldpc_awgn "-z;3;-b;1;-m;0;-r;0;-u;128") + add_armral_sim(ldpc_awgn "-z;3;-b;2;-m;0;-r;0;-u;128") add_armral_sim(modulation_awgn "-k;32;-m;0;-u;128") add_armral_sim(polar_awgn "-k;32;-e;32;-l;1;-m;0;-i;0;-u;128") add_armral_sim(turbo_awgn "-k;40;-m;0;-i;1;-e;60") diff --git a/simulation/awgn/CMakeLists.txt b/simulation/awgn/CMakeLists.txt index b5f552237f6ee944e898690552ce22bbdd52ec24..061b598e0cf9d6de5dd121f94828176060941d1f 100644 --- a/simulation/awgn/CMakeLists.txt +++ b/simulation/awgn/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.3) +cmake_minimum_required(VERSION 3.10) project(awgn VERSION 0.0) set(AWGN_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/awgn.cpp) @@ -10,6 +10,5 @@ set(AWGN_COMPILER_FLAGS $<$:-Wshadow -Wall -Wcast-qual add_library(armral_awgn ${AWGN_SOURCES}) target_include_directories(armral_awgn PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) target_link_libraries(armral_awgn PUBLIC armral armral_utils) -target_compile_options( - armral_awgn PRIVATE ${AWGN_COMPILE_OPTIONS} ${AWGN_COMPILER_FLAGS} - "$<$:-Og>") +target_compile_options(armral_awgn PRIVATE ${AWGN_COMPILE_OPTIONS} + ${AWGN_COMPILER_FLAGS}) diff --git a/simulation/ldpc_awgn/ldpc_awgn.cpp b/simulation/ldpc_awgn/ldpc_awgn.cpp index 29a7729103bbe6f4928f094a3a6f08d27e84a020..0a3827afc2f668dfc6088f765e721f562a454b34 100644 --- a/simulation/ldpc_awgn/ldpc_awgn.cpp +++ b/simulation/ldpc_awgn/ldpc_awgn.cpp @@ -314,7 +314,7 @@ bool run_snr(uint32_t z, armral_modulation_type mod_type, double tolerance = 1.0e-9; int nb = 0; - int n = z * graph->ncodeword_bits; + int n = z * graph->nmessage_bits; uint64_t nr_total = 0; uint32_t num_message_errors = 0; while (nb < 10 && nr_total < 1e6) { diff --git a/src/LowerPHY/FFT/bluestein.cpp b/src/LowerPHY/FFT/bluestein.cpp index ab6224c1396e990ae7af0e6e0cab637bb47c25ec..3596403d1d672614c8cbd3c11009075a5e08bc70 
100644 --- a/src/LowerPHY/FFT/bluestein.cpp +++ b/src/LowerPHY/FFT/bluestein.cpp @@ -53,11 +53,11 @@ make_bluestein(int n, armral_fft_direction_t dir, const int *base_kernels, } // Create 2 plans: forward and backward - armral_fft_plan_t *pf = nullptr; - armral_fft_plan_t *pb = nullptr; - auto pf_status = armral::fft::create_plan( + plan_1d_t *pf = nullptr; + plan_1d_t *pb = nullptr; + auto pf_status = armral::fft::create_plan_1d_internal( &pf, n_pad, armral_fft_direction_t::ARMRAL_FFT_FORWARDS, false); - auto pb_status = armral::fft::create_plan( + auto pb_status = armral::fft::create_plan_1d_internal( &pb, n_pad, armral_fft_direction_t::ARMRAL_FFT_BACKWARDS, false); if (pf_status == ARMRAL_ARGUMENT_ERROR || @@ -71,7 +71,7 @@ make_bluestein(int n, armral_fft_direction_t dir, const int *base_kernels, } // Execute fwds plan transforming series b - armral::fft::execute(pf, b, b, 1, 1, 1); + armral::fft::execute_1d(pf, b, b, 1, 0, 1, 0, 1); // Multiply output from FFT of b with 1/n_pad real_t recip_npad = 1.0 / n_pad; @@ -173,14 +173,16 @@ void execute_bluestein(const bluestein &bs, const Tx *x, Ty *y, multiply_a_x(work_ptr, &x[i * idist], bs.a, bs.n, bs.n_pad, istride); } - armral::fft::execute(bs.pf, work_ptr, work_ptr, 1, 1, 1); + armral::fft::execute_1d(bs.pf, work_ptr, work_ptr, 1, 0, 1, 0, + 1); for (int j = 0; j < bs.n_pad; j++) { Tw tmp = {work_ptr[j].re * bs.b[j].re - work_ptr[j].im * bs.b[j].im, work_ptr[j].re * bs.b[j].im + work_ptr[j].im * bs.b[j].re}; work_ptr[j] = tmp; } - armral::fft::execute(bs.pb, work_ptr, work_ptr, 1, 1, 1); + armral::fft::execute_1d(bs.pb, work_ptr, work_ptr, 1, 0, 1, 0, + 1); // Multiply by a and store in output vector y multiply_y_a(work_ptr, &y[i * odist], bs.a, bs.n, ostride, bs.dir); diff --git a/src/LowerPHY/FFT/bluestein.hpp b/src/LowerPHY/FFT/bluestein.hpp index 77acb4093a9bc7c3f27bb23cbc031d2addc874f4..80829f1bfbad73f4316157ee237f80918aa6fc2f 100644 --- a/src/LowerPHY/FFT/bluestein.hpp +++ b/src/LowerPHY/FFT/bluestein.hpp @@ -20,8 +20,8 @@ struct bluestein { int n_pad; armral_fft_direction_t dir; - armral_fft_plan_t *pf; - armral_fft_plan_t *pb; + plan_1d_t *pf; + plan_1d_t *pb; const Tw *a; const Tw *b; @@ -45,8 +45,7 @@ struct bluestein { bluestein &operator=(bluestein &&) = delete; bluestein(int n_in, int n_pad_in, armral_fft_direction_t dir_in, - armral_fft_plan_t *pf_in, armral_fft_plan_t *pb_in, Tw *a_in, - Tw *b_in) + plan_1d_t *pf_in, plan_1d_t *pb_in, Tw *a_in, Tw *b_in) : n(n_in), n_pad(n_pad_in), dir(dir_in), pf(pf_in), pb(pb_in), a(a_in), b(b_in) {} diff --git a/src/LowerPHY/FFT/fft_cf32.cpp b/src/LowerPHY/FFT/fft_cf32.cpp index 2d8b1d6f1044d2e5c2c90839504bcd4cb4765ed7..1f3d5739fd1a17ba371b045ec4efc64c64e1d515 100644 --- a/src/LowerPHY/FFT/fft_cf32.cpp +++ b/src/LowerPHY/FFT/fft_cf32.cpp @@ -9,15 +9,22 @@ armral_status armral_fft_create_plan_cf32(armral_fft_plan_t **p, int n, armral_fft_direction_t dir) { - return armral::fft::create_plan(p, n, dir, true); + return armral::fft::create_plan_1d(p, n, dir, true); +} + +armral_status armral_fft_create_2d_plan_cf32(armral_fft_plan_t **p, int n0, + int n1, + armral_fft_direction_t dir) { + return armral::fft::create_plan_2d(p, n0, n1, dir, true); } armral_status armral_fft_execute_cf32(const armral_fft_plan_t *p, const armral_cmplx_f32_t *x, armral_cmplx_f32_t *y) { return armral::fft::execute(p, x, y, 1, 1, 1); + armral_cmplx_f32_t>(p, x, y); } armral_status armral_fft_destroy_plan_cf32(armral_fft_plan_t **p) { diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gs.c 
b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gs.c index 99f255e14e85f18942c6b4803e49183db893542c..3a3d6f9692d460172cd575863268c5cb446ac64e 100644 --- a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gs.c +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gs.c @@ -14801,3 +14801,1507 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs25(const armral_cmplx_f32_t *restrict x, } } #endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cf32_ab_t_gs32(const armral_cmplx_f32_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, + const armral_cmplx_f32_t *restrict w, + int howmany, int idist, int odist, + float dir) { + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + const float32x2_t *v7 = (const float32x2_t *)w; + for (int j = 0; j < howmany; j += 1) { + float32x2_t v407 = v5[istride]; + float v1418 = 7.0710678118654757e-01F; + float v1429 = -7.0710678118654746e-01F; + float v1475 = 5.5557023301960229e-01F; + float v1489 = -1.9509032201612861e-01F; + float v1536 = 9.2387953251128674e-01F; + float v1543 = -9.2387953251128685e-01F; + float v1546 = 3.8268343236508967e-01F; + float v1547 = -3.8268343236508967e-01F; + float v1589 = 1.9509032201612833e-01F; + float v1592 = -9.8078528040323043e-01F; + float v1593 = 9.8078528040323043e-01F; + float v1600 = -5.5557023301960218e-01F; + float v1603 = 8.3146961230254524e-01F; + float v1604 = -8.3146961230254524e-01F; + float v1614 = -1.0000000000000000e+00F; + float v1615 = 1.0000000000000000e+00F; + float32x2_t v1617 = (float32x2_t){v4, v4}; + float32x2_t v444 = vtrn1_f32(v407, v407); + float32x2_t v445 = vtrn2_f32(v407, v407); + float32x2_t v851 = v5[0]; + float32x2_t v1248 = (float32x2_t){v1593, v1593}; + float32x2_t v1305 = (float32x2_t){v1536, v1536}; + float32x2_t v1309 = (float32x2_t){v1547, v1546}; + float32x2_t v1362 = (float32x2_t){v1603, v1603}; + float32x2_t v1366 = (float32x2_t){v1600, v1475}; + float32x2_t v1373 = (float32x2_t){v1489, v1489}; + float32x2_t v1419 = (float32x2_t){v1418, v1418}; + float32x2_t v1430 = (float32x2_t){v1429, v1429}; + float32x2_t v1434 = (float32x2_t){v1615, v1614}; + float32x2_t v1476 = (float32x2_t){v1475, v1475}; + float32x2_t v1480 = (float32x2_t){v1604, v1603}; + float32x2_t v1487 = (float32x2_t){v1592, v1592}; + float32x2_t v1491 = (float32x2_t){v1489, v1589}; + float32x2_t v1533 = (float32x2_t){v1546, v1546}; + float32x2_t v1537 = (float32x2_t){v1543, v1536}; + float32x2_t v1544 = (float32x2_t){v1543, v1543}; + float32x2_t v1548 = (float32x2_t){v1546, v1547}; + float32x2_t v1590 = (float32x2_t){v1589, v1589}; + float32x2_t v1594 = (float32x2_t){v1592, v1593}; + float32x2_t v1601 = (float32x2_t){v1600, v1600}; + float32x2_t v1605 = (float32x2_t){v1603, v1604}; + float32x2_t v1616 = (float32x2_t){v1614, v1615}; + float32x2_t v20 = v5[istride * 16]; + int64_t v37 = 30 + j * 62; + float32x2_t v51 = v5[istride * 8]; + int64_t v55 = 14 + j * 62; + float32x2_t v69 = v5[istride * 24]; + int64_t v73 = 46 + j * 62; + float32x2_t v87 = v5[istride * 4]; + float32x2_t v105 = v5[istride * 20]; + int64_t v122 = 6 + j * 62; + int64_t v135 = 38 + j * 62; + float32x2_t v149 = v5[istride * 12]; + float32x2_t v167 = v5[istride * 28]; + int64_t v184 = 22 + j * 62; + int64_t v197 = 54 + j * 62; + float32x2_t v211 = v5[istride * 2]; + float32x2_t v229 = v5[istride * 18]; + int64_t v246 = 2 + j * 62; + int64_t v259 = 34 + j * 62; + float32x2_t v273 = v5[istride * 10]; + int64_t v277 = 18 + j * 62; + float32x2_t v291 = v5[istride * 26]; + int64_t v295 = 50 + j * 62; + 
float32x2_t v309 = v5[istride * 6]; + float32x2_t v327 = v5[istride * 22]; + int64_t v344 = 10 + j * 62; + int64_t v357 = 42 + j * 62; + float32x2_t v371 = v5[istride * 14]; + int64_t v375 = 26 + j * 62; + float32x2_t v389 = v5[istride * 30]; + int64_t v393 = 58 + j * 62; + float32x2_t v425 = v5[istride * 17]; + float32x2_t v443 = v7[j * 62]; + int64_t v447 = j * 62 + 1; + int64_t v455 = 32 + j * 62; + float32x2_t v469 = v5[istride * 9]; + int64_t v473 = 16 + j * 62; + float32x2_t v487 = v5[istride * 25]; + int64_t v491 = 48 + j * 62; + float32x2_t v505 = v5[istride * 5]; + float32x2_t v523 = v5[istride * 21]; + int64_t v540 = 8 + j * 62; + int64_t v553 = 40 + j * 62; + float32x2_t v567 = v5[istride * 13]; + float32x2_t v585 = v5[istride * 29]; + int64_t v602 = 24 + j * 62; + int64_t v615 = 56 + j * 62; + float32x2_t v629 = v5[istride * 3]; + float32x2_t v647 = v5[istride * 19]; + int64_t v664 = 4 + j * 62; + int64_t v677 = 36 + j * 62; + float32x2_t v691 = v5[istride * 11]; + int64_t v695 = 20 + j * 62; + float32x2_t v709 = v5[istride * 27]; + int64_t v713 = 52 + j * 62; + float32x2_t v727 = v5[istride * 7]; + float32x2_t v745 = v5[istride * 23]; + int64_t v762 = 12 + j * 62; + int64_t v775 = 44 + j * 62; + float32x2_t v789 = v5[istride * 15]; + float32x2_t v807 = v5[istride * 31]; + int64_t v824 = 28 + j * 62; + int64_t v837 = 60 + j * 62; + float32x2_t v1311 = vmul_f32(v1617, v1309); + float32x2_t v1368 = vmul_f32(v1617, v1366); + float32x2_t v1436 = vmul_f32(v1617, v1434); + float32x2_t v1482 = vmul_f32(v1617, v1480); + float32x2_t v1493 = vmul_f32(v1617, v1491); + float32x2_t v1539 = vmul_f32(v1617, v1537); + float32x2_t v1550 = vmul_f32(v1617, v1548); + float32x2_t v1596 = vmul_f32(v1617, v1594); + float32x2_t v1607 = vmul_f32(v1617, v1605); + float32x2_t v1618 = vmul_f32(v1617, v1616); + float32x2_t v38 = v7[v37]; + float32x2_t v39 = vtrn1_f32(v20, v20); + float32x2_t v40 = vtrn2_f32(v20, v20); + int64_t v42 = v37 + 1; + float32x2_t v56 = v7[v55]; + float32x2_t v57 = vtrn1_f32(v51, v51); + float32x2_t v58 = vtrn2_f32(v51, v51); + int64_t v60 = v55 + 1; + float32x2_t v74 = v7[v73]; + float32x2_t v75 = vtrn1_f32(v69, v69); + float32x2_t v76 = vtrn2_f32(v69, v69); + int64_t v78 = v73 + 1; + float32x2_t v123 = v7[v122]; + float32x2_t v124 = vtrn1_f32(v87, v87); + float32x2_t v125 = vtrn2_f32(v87, v87); + int64_t v127 = v122 + 1; + float32x2_t v136 = v7[v135]; + float32x2_t v137 = vtrn1_f32(v105, v105); + float32x2_t v138 = vtrn2_f32(v105, v105); + int64_t v140 = v135 + 1; + float32x2_t v185 = v7[v184]; + float32x2_t v186 = vtrn1_f32(v149, v149); + float32x2_t v187 = vtrn2_f32(v149, v149); + int64_t v189 = v184 + 1; + float32x2_t v198 = v7[v197]; + float32x2_t v199 = vtrn1_f32(v167, v167); + float32x2_t v200 = vtrn2_f32(v167, v167); + int64_t v202 = v197 + 1; + float32x2_t v247 = v7[v246]; + float32x2_t v248 = vtrn1_f32(v211, v211); + float32x2_t v249 = vtrn2_f32(v211, v211); + int64_t v251 = v246 + 1; + float32x2_t v260 = v7[v259]; + float32x2_t v261 = vtrn1_f32(v229, v229); + float32x2_t v262 = vtrn2_f32(v229, v229); + int64_t v264 = v259 + 1; + float32x2_t v278 = v7[v277]; + float32x2_t v279 = vtrn1_f32(v273, v273); + float32x2_t v280 = vtrn2_f32(v273, v273); + int64_t v282 = v277 + 1; + float32x2_t v296 = v7[v295]; + float32x2_t v297 = vtrn1_f32(v291, v291); + float32x2_t v298 = vtrn2_f32(v291, v291); + int64_t v300 = v295 + 1; + float32x2_t v345 = v7[v344]; + float32x2_t v346 = vtrn1_f32(v309, v309); + float32x2_t v347 = vtrn2_f32(v309, v309); + int64_t v349 = v344 + 1; + 
float32x2_t v358 = v7[v357]; + float32x2_t v359 = vtrn1_f32(v327, v327); + float32x2_t v360 = vtrn2_f32(v327, v327); + int64_t v362 = v357 + 1; + float32x2_t v376 = v7[v375]; + float32x2_t v377 = vtrn1_f32(v371, v371); + float32x2_t v378 = vtrn2_f32(v371, v371); + int64_t v380 = v375 + 1; + float32x2_t v394 = v7[v393]; + float32x2_t v395 = vtrn1_f32(v389, v389); + float32x2_t v396 = vtrn2_f32(v389, v389); + int64_t v398 = v393 + 1; + float32x2_t v448 = v7[v447]; + float32x2_t v449 = vmul_f32(v444, v443); + float32x2_t v456 = v7[v455]; + float32x2_t v457 = vtrn1_f32(v425, v425); + float32x2_t v458 = vtrn2_f32(v425, v425); + int64_t v460 = v455 + 1; + float32x2_t v474 = v7[v473]; + float32x2_t v475 = vtrn1_f32(v469, v469); + float32x2_t v476 = vtrn2_f32(v469, v469); + int64_t v478 = v473 + 1; + float32x2_t v492 = v7[v491]; + float32x2_t v493 = vtrn1_f32(v487, v487); + float32x2_t v494 = vtrn2_f32(v487, v487); + int64_t v496 = v491 + 1; + float32x2_t v541 = v7[v540]; + float32x2_t v542 = vtrn1_f32(v505, v505); + float32x2_t v543 = vtrn2_f32(v505, v505); + int64_t v545 = v540 + 1; + float32x2_t v554 = v7[v553]; + float32x2_t v555 = vtrn1_f32(v523, v523); + float32x2_t v556 = vtrn2_f32(v523, v523); + int64_t v558 = v553 + 1; + float32x2_t v603 = v7[v602]; + float32x2_t v604 = vtrn1_f32(v567, v567); + float32x2_t v605 = vtrn2_f32(v567, v567); + int64_t v607 = v602 + 1; + float32x2_t v616 = v7[v615]; + float32x2_t v617 = vtrn1_f32(v585, v585); + float32x2_t v618 = vtrn2_f32(v585, v585); + int64_t v620 = v615 + 1; + float32x2_t v665 = v7[v664]; + float32x2_t v666 = vtrn1_f32(v629, v629); + float32x2_t v667 = vtrn2_f32(v629, v629); + int64_t v669 = v664 + 1; + float32x2_t v678 = v7[v677]; + float32x2_t v679 = vtrn1_f32(v647, v647); + float32x2_t v680 = vtrn2_f32(v647, v647); + int64_t v682 = v677 + 1; + float32x2_t v696 = v7[v695]; + float32x2_t v697 = vtrn1_f32(v691, v691); + float32x2_t v698 = vtrn2_f32(v691, v691); + int64_t v700 = v695 + 1; + float32x2_t v714 = v7[v713]; + float32x2_t v715 = vtrn1_f32(v709, v709); + float32x2_t v716 = vtrn2_f32(v709, v709); + int64_t v718 = v713 + 1; + float32x2_t v763 = v7[v762]; + float32x2_t v764 = vtrn1_f32(v727, v727); + float32x2_t v765 = vtrn2_f32(v727, v727); + int64_t v767 = v762 + 1; + float32x2_t v776 = v7[v775]; + float32x2_t v777 = vtrn1_f32(v745, v745); + float32x2_t v778 = vtrn2_f32(v745, v745); + int64_t v780 = v775 + 1; + float32x2_t v825 = v7[v824]; + float32x2_t v826 = vtrn1_f32(v789, v789); + float32x2_t v827 = vtrn2_f32(v789, v789); + int64_t v829 = v824 + 1; + float32x2_t v838 = v7[v837]; + float32x2_t v839 = vtrn1_f32(v807, v807); + float32x2_t v840 = vtrn2_f32(v807, v807); + int64_t v842 = v837 + 1; + float32x2_t v43 = v7[v42]; + float32x2_t v44 = vmul_f32(v39, v38); + float32x2_t v61 = v7[v60]; + float32x2_t v62 = vmul_f32(v57, v56); + float32x2_t v79 = v7[v78]; + float32x2_t v80 = vmul_f32(v75, v74); + float32x2_t v128 = v7[v127]; + float32x2_t v129 = vmul_f32(v124, v123); + float32x2_t v141 = v7[v140]; + float32x2_t v142 = vmul_f32(v137, v136); + float32x2_t v190 = v7[v189]; + float32x2_t v191 = vmul_f32(v186, v185); + float32x2_t v203 = v7[v202]; + float32x2_t v204 = vmul_f32(v199, v198); + float32x2_t v252 = v7[v251]; + float32x2_t v253 = vmul_f32(v248, v247); + float32x2_t v265 = v7[v264]; + float32x2_t v266 = vmul_f32(v261, v260); + float32x2_t v283 = v7[v282]; + float32x2_t v284 = vmul_f32(v279, v278); + float32x2_t v301 = v7[v300]; + float32x2_t v302 = vmul_f32(v297, v296); + float32x2_t v350 = v7[v349]; + float32x2_t v351 = 
vmul_f32(v346, v345); + float32x2_t v363 = v7[v362]; + float32x2_t v364 = vmul_f32(v359, v358); + float32x2_t v381 = v7[v380]; + float32x2_t v382 = vmul_f32(v377, v376); + float32x2_t v399 = v7[v398]; + float32x2_t v400 = vmul_f32(v395, v394); + float32x2_t v461 = v7[v460]; + float32x2_t v462 = vmul_f32(v457, v456); + float32x2_t v479 = v7[v478]; + float32x2_t v480 = vmul_f32(v475, v474); + float32x2_t v497 = v7[v496]; + float32x2_t v498 = vmul_f32(v493, v492); + float32x2_t v546 = v7[v545]; + float32x2_t v547 = vmul_f32(v542, v541); + float32x2_t v559 = v7[v558]; + float32x2_t v560 = vmul_f32(v555, v554); + float32x2_t v608 = v7[v607]; + float32x2_t v609 = vmul_f32(v604, v603); + float32x2_t v621 = v7[v620]; + float32x2_t v622 = vmul_f32(v617, v616); + float32x2_t v670 = v7[v669]; + float32x2_t v671 = vmul_f32(v666, v665); + float32x2_t v683 = v7[v682]; + float32x2_t v684 = vmul_f32(v679, v678); + float32x2_t v701 = v7[v700]; + float32x2_t v702 = vmul_f32(v697, v696); + float32x2_t v719 = v7[v718]; + float32x2_t v720 = vmul_f32(v715, v714); + float32x2_t v768 = v7[v767]; + float32x2_t v769 = vmul_f32(v764, v763); + float32x2_t v781 = v7[v780]; + float32x2_t v782 = vmul_f32(v777, v776); + float32x2_t v830 = v7[v829]; + float32x2_t v831 = vmul_f32(v826, v825); + float32x2_t v843 = v7[v842]; + float32x2_t v844 = vmul_f32(v839, v838); + float32x2_t v451 = vfma_f32(v449, v445, v448); + float32x2_t v46 = vfma_f32(v44, v40, v43); + float32x2_t v64 = vfma_f32(v62, v58, v61); + float32x2_t v82 = vfma_f32(v80, v76, v79); + float32x2_t v131 = vfma_f32(v129, v125, v128); + float32x2_t v144 = vfma_f32(v142, v138, v141); + float32x2_t v193 = vfma_f32(v191, v187, v190); + float32x2_t v206 = vfma_f32(v204, v200, v203); + float32x2_t v255 = vfma_f32(v253, v249, v252); + float32x2_t v268 = vfma_f32(v266, v262, v265); + float32x2_t v286 = vfma_f32(v284, v280, v283); + float32x2_t v304 = vfma_f32(v302, v298, v301); + float32x2_t v353 = vfma_f32(v351, v347, v350); + float32x2_t v366 = vfma_f32(v364, v360, v363); + float32x2_t v384 = vfma_f32(v382, v378, v381); + float32x2_t v402 = vfma_f32(v400, v396, v399); + float32x2_t v464 = vfma_f32(v462, v458, v461); + float32x2_t v482 = vfma_f32(v480, v476, v479); + float32x2_t v500 = vfma_f32(v498, v494, v497); + float32x2_t v549 = vfma_f32(v547, v543, v546); + float32x2_t v562 = vfma_f32(v560, v556, v559); + float32x2_t v611 = vfma_f32(v609, v605, v608); + float32x2_t v624 = vfma_f32(v622, v618, v621); + float32x2_t v673 = vfma_f32(v671, v667, v670); + float32x2_t v686 = vfma_f32(v684, v680, v683); + float32x2_t v704 = vfma_f32(v702, v698, v701); + float32x2_t v722 = vfma_f32(v720, v716, v719); + float32x2_t v771 = vfma_f32(v769, v765, v768); + float32x2_t v784 = vfma_f32(v782, v778, v781); + float32x2_t v833 = vfma_f32(v831, v827, v830); + float32x2_t v846 = vfma_f32(v844, v840, v843); + float32x2_t v852 = vadd_f32(v851, v46); + float32x2_t v853 = vsub_f32(v851, v46); + float32x2_t v854 = vadd_f32(v64, v82); + float32x2_t v855 = vsub_f32(v64, v82); + float32x2_t v867 = vadd_f32(v131, v144); + float32x2_t v868 = vsub_f32(v131, v144); + float32x2_t v869 = vadd_f32(v193, v206); + float32x2_t v870 = vsub_f32(v193, v206); + float32x2_t v921 = vadd_f32(v255, v268); + float32x2_t v922 = vsub_f32(v255, v268); + float32x2_t v923 = vadd_f32(v286, v304); + float32x2_t v924 = vsub_f32(v286, v304); + float32x2_t v936 = vadd_f32(v353, v366); + float32x2_t v937 = vsub_f32(v353, v366); + float32x2_t v938 = vadd_f32(v384, v402); + float32x2_t v939 = vsub_f32(v384, v402); + 
float32x2_t v1075 = vadd_f32(v451, v464); + float32x2_t v1076 = vsub_f32(v451, v464); + float32x2_t v1077 = vadd_f32(v482, v500); + float32x2_t v1078 = vsub_f32(v482, v500); + float32x2_t v1090 = vadd_f32(v549, v562); + float32x2_t v1091 = vsub_f32(v549, v562); + float32x2_t v1092 = vadd_f32(v611, v624); + float32x2_t v1093 = vsub_f32(v611, v624); + float32x2_t v1144 = vadd_f32(v673, v686); + float32x2_t v1145 = vsub_f32(v673, v686); + float32x2_t v1146 = vadd_f32(v704, v722); + float32x2_t v1147 = vsub_f32(v704, v722); + float32x2_t v1159 = vadd_f32(v771, v784); + float32x2_t v1160 = vsub_f32(v771, v784); + float32x2_t v1161 = vadd_f32(v833, v846); + float32x2_t v1162 = vsub_f32(v833, v846); + float32x2_t v861 = vrev64_f32(v855); + float32x2_t v863 = vadd_f32(v852, v854); + float32x2_t v864 = vsub_f32(v852, v854); + float32x2_t v871 = vadd_f32(v867, v869); + float32x2_t v872 = vsub_f32(v867, v869); + float32x2_t v887 = vmul_f32(v868, v1419); + float32x2_t v898 = vmul_f32(v870, v1430); + float32x2_t v930 = vrev64_f32(v924); + float32x2_t v932 = vadd_f32(v921, v923); + float32x2_t v933 = vsub_f32(v921, v923); + float32x2_t v945 = vrev64_f32(v939); + float32x2_t v947 = vadd_f32(v936, v938); + float32x2_t v948 = vsub_f32(v936, v938); + float32x2_t v1084 = vrev64_f32(v1078); + float32x2_t v1086 = vadd_f32(v1075, v1077); + float32x2_t v1087 = vsub_f32(v1075, v1077); + float32x2_t v1094 = vadd_f32(v1090, v1092); + float32x2_t v1095 = vsub_f32(v1090, v1092); + float32x2_t v1110 = vmul_f32(v1091, v1419); + float32x2_t v1121 = vmul_f32(v1093, v1430); + float32x2_t v1153 = vrev64_f32(v1147); + float32x2_t v1155 = vadd_f32(v1144, v1146); + float32x2_t v1156 = vsub_f32(v1144, v1146); + float32x2_t v1163 = vadd_f32(v1159, v1161); + float32x2_t v1164 = vsub_f32(v1159, v1161); + float32x2_t v1179 = vmul_f32(v1160, v1419); + float32x2_t v1190 = vmul_f32(v1162, v1430); + float32x2_t v862 = vmul_f32(v861, v1436); + float32x2_t v878 = vrev64_f32(v872); + float32x2_t v880 = vadd_f32(v863, v871); + float32x2_t v881 = vsub_f32(v863, v871); + float32x2_t v893 = vrev64_f32(v887); + float32x2_t v904 = vrev64_f32(v898); + float32x2_t v931 = vmul_f32(v930, v1436); + float32x2_t v946 = vmul_f32(v945, v1436); + float32x2_t v951 = vadd_f32(v932, v947); + float32x2_t v952 = vsub_f32(v932, v947); + float32x2_t v1004 = vmul_f32(v933, v1419); + float32x2_t v1015 = vmul_f32(v948, v1430); + float32x2_t v1085 = vmul_f32(v1084, v1436); + float32x2_t v1101 = vrev64_f32(v1095); + float32x2_t v1103 = vadd_f32(v1086, v1094); + float32x2_t v1104 = vsub_f32(v1086, v1094); + float32x2_t v1116 = vrev64_f32(v1110); + float32x2_t v1127 = vrev64_f32(v1121); + float32x2_t v1154 = vmul_f32(v1153, v1436); + float32x2_t v1170 = vrev64_f32(v1164); + float32x2_t v1172 = vadd_f32(v1155, v1163); + float32x2_t v1173 = vsub_f32(v1155, v1163); + float32x2_t v1185 = vrev64_f32(v1179); + float32x2_t v1196 = vrev64_f32(v1190); + float32x2_t v865 = vsub_f32(v853, v862); + float32x2_t v866 = vadd_f32(v853, v862); + float32x2_t v879 = vmul_f32(v878, v1436); + float32x2_t v894 = vmul_f32(v893, v1618); + float32x2_t v905 = vmul_f32(v904, v1436); + float32x2_t v934 = vsub_f32(v922, v931); + float32x2_t v935 = vadd_f32(v922, v931); + float32x2_t v949 = vsub_f32(v937, v946); + float32x2_t v950 = vadd_f32(v937, v946); + float32x2_t v958 = vrev64_f32(v952); + float32x2_t v960 = vadd_f32(v880, v951); + float32x2_t v961 = vsub_f32(v880, v951); + float32x2_t v1010 = vrev64_f32(v1004); + float32x2_t v1021 = vrev64_f32(v1015); + float32x2_t v1088 = vsub_f32(v1076, 
v1085); + float32x2_t v1089 = vadd_f32(v1076, v1085); + float32x2_t v1102 = vmul_f32(v1101, v1436); + float32x2_t v1117 = vmul_f32(v1116, v1618); + float32x2_t v1128 = vmul_f32(v1127, v1436); + float32x2_t v1157 = vsub_f32(v1145, v1154); + float32x2_t v1158 = vadd_f32(v1145, v1154); + float32x2_t v1171 = vmul_f32(v1170, v1436); + float32x2_t v1186 = vmul_f32(v1185, v1618); + float32x2_t v1197 = vmul_f32(v1196, v1436); + float32x2_t v1213 = vadd_f32(v1103, v1172); + float32x2_t v1214 = vsub_f32(v1103, v1172); + float32x2_t v1420 = vmul_f32(v1104, v1419); + float32x2_t v1431 = vmul_f32(v1173, v1430); + float32x2_t v882 = vsub_f32(v864, v879); + float32x2_t v883 = vadd_f32(v864, v879); + float32x2_t v906 = vadd_f32(v887, v894); + float32x2_t v907 = vadd_f32(v898, v905); + float32x2_t v959 = vmul_f32(v958, v1436); + float32x2_t v967 = vmul_f32(v934, v1305); + float32x2_t v973 = vrev64_f32(v934); + float32x2_t v978 = vmul_f32(v949, v1533); + float32x2_t v984 = vrev64_f32(v949); + float32x2_t v1011 = vmul_f32(v1010, v1618); + float32x2_t v1022 = vmul_f32(v1021, v1436); + float32x2_t v1041 = vmul_f32(v935, v1533); + float32x2_t v1047 = vrev64_f32(v935); + float32x2_t v1052 = vmul_f32(v950, v1544); + float32x2_t v1058 = vrev64_f32(v950); + float32x2_t v1105 = vsub_f32(v1087, v1102); + float32x2_t v1106 = vadd_f32(v1087, v1102); + float32x2_t v1129 = vadd_f32(v1110, v1117); + float32x2_t v1130 = vadd_f32(v1121, v1128); + float32x2_t v1174 = vsub_f32(v1156, v1171); + float32x2_t v1175 = vadd_f32(v1156, v1171); + float32x2_t v1198 = vadd_f32(v1179, v1186); + float32x2_t v1199 = vadd_f32(v1190, v1197); + float32x2_t v1220 = vrev64_f32(v1214); + float32x2_t v1222 = vadd_f32(v960, v1213); + float32x2_t v1223 = vsub_f32(v960, v1213); + float32x2_t v1426 = vrev64_f32(v1420); + float32x2_t v1437 = vrev64_f32(v1431); + float32x2_t v908 = vadd_f32(v906, v907); + float32x2_t v909 = vsub_f32(v907, v906); + float32x2_t v962 = vsub_f32(v881, v959); + float32x2_t v963 = vadd_f32(v881, v959); + float32x2_t v1023 = vadd_f32(v1004, v1011); + float32x2_t v1024 = vadd_f32(v1015, v1022); + float32x2_t v1131 = vadd_f32(v1129, v1130); + float32x2_t v1132 = vsub_f32(v1130, v1129); + float32x2_t v1200 = vadd_f32(v1198, v1199); + float32x2_t v1201 = vsub_f32(v1199, v1198); + float32x2_t v1221 = vmul_f32(v1220, v1436); + v6[0] = v1222; + v6[ostride * 16] = v1223; + float32x2_t v1306 = vmul_f32(v1105, v1305); + float32x2_t v1312 = vrev64_f32(v1105); + float32x2_t v1317 = vmul_f32(v1174, v1533); + float32x2_t v1323 = vrev64_f32(v1174); + float32x2_t v1427 = vmul_f32(v1426, v1618); + float32x2_t v1438 = vmul_f32(v1437, v1436); + float32x2_t v1534 = vmul_f32(v1106, v1533); + float32x2_t v1540 = vrev64_f32(v1106); + float32x2_t v1545 = vmul_f32(v1175, v1544); + float32x2_t v1551 = vrev64_f32(v1175); + float32x2_t v915 = vrev64_f32(v909); + float32x2_t v917 = vadd_f32(v865, v908); + float32x2_t v918 = vsub_f32(v865, v908); + float32x2_t v986 = vfma_f32(v967, v973, v1311); + float32x2_t v987 = vfma_f32(v978, v984, v1539); + float32x2_t v1025 = vadd_f32(v1023, v1024); + float32x2_t v1026 = vsub_f32(v1024, v1023); + float32x2_t v1060 = vfma_f32(v1041, v1047, v1539); + float32x2_t v1061 = vfma_f32(v1052, v1058, v1550); + float32x2_t v1138 = vrev64_f32(v1132); + float32x2_t v1140 = vadd_f32(v1088, v1131); + float32x2_t v1141 = vsub_f32(v1088, v1131); + float32x2_t v1207 = vrev64_f32(v1201); + float32x2_t v1209 = vadd_f32(v1157, v1200); + float32x2_t v1210 = vsub_f32(v1157, v1200); + float32x2_t v1224 = vsub_f32(v961, v1221); + 
float32x2_t v1225 = vadd_f32(v961, v1221); + float32x2_t v1439 = vadd_f32(v1420, v1427); + float32x2_t v1440 = vadd_f32(v1431, v1438); + float32x2_t v916 = vmul_f32(v915, v1618); + float32x2_t v988 = vadd_f32(v986, v987); + float32x2_t v989 = vsub_f32(v987, v986); + float32x2_t v1032 = vrev64_f32(v1026); + float32x2_t v1034 = vadd_f32(v882, v1025); + float32x2_t v1035 = vsub_f32(v882, v1025); + float32x2_t v1062 = vadd_f32(v1060, v1061); + float32x2_t v1063 = vsub_f32(v1061, v1060); + float32x2_t v1139 = vmul_f32(v1138, v1618); + float32x2_t v1208 = vmul_f32(v1207, v1618); + v6[ostride * 8] = v1224; + v6[ostride * 24] = v1225; + float32x2_t v1249 = vmul_f32(v1140, v1248); + float32x2_t v1255 = vrev64_f32(v1140); + float32x2_t v1260 = vmul_f32(v1209, v1362); + float32x2_t v1266 = vrev64_f32(v1209); + float32x2_t v1325 = vfma_f32(v1306, v1312, v1311); + float32x2_t v1326 = vfma_f32(v1317, v1323, v1539); + float32x2_t v1441 = vadd_f32(v1439, v1440); + float32x2_t v1442 = vsub_f32(v1440, v1439); + float32x2_t v1477 = vmul_f32(v1141, v1476); + float32x2_t v1483 = vrev64_f32(v1141); + float32x2_t v1488 = vmul_f32(v1210, v1487); + float32x2_t v1494 = vrev64_f32(v1210); + float32x2_t v1553 = vfma_f32(v1534, v1540, v1539); + float32x2_t v1554 = vfma_f32(v1545, v1551, v1550); + float32x2_t v919 = vsub_f32(v866, v916); + float32x2_t v920 = vadd_f32(v866, v916); + float32x2_t v995 = vrev64_f32(v989); + float32x2_t v997 = vadd_f32(v917, v988); + float32x2_t v998 = vsub_f32(v917, v988); + float32x2_t v1033 = vmul_f32(v1032, v1618); + float32x2_t v1069 = vrev64_f32(v1063); + float32x2_t v1142 = vsub_f32(v1089, v1139); + float32x2_t v1143 = vadd_f32(v1089, v1139); + float32x2_t v1211 = vsub_f32(v1158, v1208); + float32x2_t v1212 = vadd_f32(v1158, v1208); + float32x2_t v1327 = vadd_f32(v1325, v1326); + float32x2_t v1328 = vsub_f32(v1326, v1325); + float32x2_t v1448 = vrev64_f32(v1442); + float32x2_t v1450 = vadd_f32(v962, v1441); + float32x2_t v1451 = vsub_f32(v962, v1441); + float32x2_t v1555 = vadd_f32(v1553, v1554); + float32x2_t v1556 = vsub_f32(v1554, v1553); + float32x2_t v996 = vmul_f32(v995, v1618); + float32x2_t v1036 = vsub_f32(v883, v1033); + float32x2_t v1037 = vadd_f32(v883, v1033); + float32x2_t v1070 = vmul_f32(v1069, v1618); + float32x2_t v1071 = vadd_f32(v919, v1062); + float32x2_t v1072 = vsub_f32(v919, v1062); + float32x2_t v1268 = vfma_f32(v1249, v1255, v1493); + float32x2_t v1269 = vfma_f32(v1260, v1266, v1368); + float32x2_t v1334 = vrev64_f32(v1328); + float32x2_t v1336 = vadd_f32(v1034, v1327); + float32x2_t v1337 = vsub_f32(v1034, v1327); + float32x2_t v1363 = vmul_f32(v1142, v1362); + float32x2_t v1369 = vrev64_f32(v1142); + float32x2_t v1374 = vmul_f32(v1211, v1373); + float32x2_t v1380 = vrev64_f32(v1211); + float32x2_t v1449 = vmul_f32(v1448, v1618); + v6[ostride * 4] = v1450; + v6[ostride * 20] = v1451; + float32x2_t v1496 = vfma_f32(v1477, v1483, v1482); + float32x2_t v1497 = vfma_f32(v1488, v1494, v1493); + float32x2_t v1562 = vrev64_f32(v1556); + float32x2_t v1591 = vmul_f32(v1143, v1590); + float32x2_t v1597 = vrev64_f32(v1143); + float32x2_t v1602 = vmul_f32(v1212, v1601); + float32x2_t v1608 = vrev64_f32(v1212); + float32x2_t v999 = vsub_f32(v918, v996); + float32x2_t v1000 = vadd_f32(v918, v996); + float32x2_t v1073 = vsub_f32(v920, v1070); + float32x2_t v1074 = vadd_f32(v920, v1070); + float32x2_t v1270 = vadd_f32(v1268, v1269); + float32x2_t v1271 = vsub_f32(v1269, v1268); + float32x2_t v1335 = vmul_f32(v1334, v1618); + v6[ostride * 2] = v1336; + v6[ostride * 18] = 
v1337; + float32x2_t v1452 = vsub_f32(v963, v1449); + float32x2_t v1453 = vadd_f32(v963, v1449); + float32x2_t v1498 = vadd_f32(v1496, v1497); + float32x2_t v1499 = vsub_f32(v1497, v1496); + float32x2_t v1563 = vmul_f32(v1562, v1618); + float32x2_t v1564 = vadd_f32(v1036, v1555); + float32x2_t v1565 = vsub_f32(v1036, v1555); + float32x2_t v1277 = vrev64_f32(v1271); + float32x2_t v1279 = vadd_f32(v997, v1270); + float32x2_t v1280 = vsub_f32(v997, v1270); + float32x2_t v1338 = vsub_f32(v1035, v1335); + float32x2_t v1339 = vadd_f32(v1035, v1335); + float32x2_t v1382 = vfma_f32(v1363, v1369, v1368); + float32x2_t v1383 = vfma_f32(v1374, v1380, v1596); + v6[ostride * 12] = v1452; + v6[ostride * 28] = v1453; + float32x2_t v1505 = vrev64_f32(v1499); + float32x2_t v1507 = vadd_f32(v999, v1498); + float32x2_t v1508 = vsub_f32(v999, v1498); + float32x2_t v1566 = vsub_f32(v1037, v1563); + float32x2_t v1567 = vadd_f32(v1037, v1563); + v6[ostride * 6] = v1564; + v6[ostride * 22] = v1565; + float32x2_t v1610 = vfma_f32(v1591, v1597, v1596); + float32x2_t v1611 = vfma_f32(v1602, v1608, v1607); + float32x2_t v1278 = vmul_f32(v1277, v1618); + v6[ostride] = v1279; + v6[ostride * 17] = v1280; + v6[ostride * 10] = v1338; + v6[ostride * 26] = v1339; + float32x2_t v1384 = vadd_f32(v1382, v1383); + float32x2_t v1385 = vsub_f32(v1383, v1382); + float32x2_t v1506 = vmul_f32(v1505, v1618); + v6[ostride * 5] = v1507; + v6[ostride * 21] = v1508; + v6[ostride * 14] = v1566; + v6[ostride * 30] = v1567; + float32x2_t v1612 = vadd_f32(v1610, v1611); + float32x2_t v1613 = vsub_f32(v1611, v1610); + float32x2_t v1281 = vsub_f32(v998, v1278); + float32x2_t v1282 = vadd_f32(v998, v1278); + float32x2_t v1391 = vrev64_f32(v1385); + float32x2_t v1393 = vadd_f32(v1071, v1384); + float32x2_t v1394 = vsub_f32(v1071, v1384); + float32x2_t v1509 = vsub_f32(v1000, v1506); + float32x2_t v1510 = vadd_f32(v1000, v1506); + float32x2_t v1619 = vrev64_f32(v1613); + float32x2_t v1621 = vadd_f32(v1073, v1612); + float32x2_t v1622 = vsub_f32(v1073, v1612); + v6[ostride * 9] = v1281; + v6[ostride * 25] = v1282; + float32x2_t v1392 = vmul_f32(v1391, v1618); + v6[ostride * 3] = v1393; + v6[ostride * 19] = v1394; + v6[ostride * 13] = v1509; + v6[ostride * 29] = v1510; + float32x2_t v1620 = vmul_f32(v1619, v1618); + v6[ostride * 7] = v1621; + v6[ostride * 23] = v1622; + float32x2_t v1395 = vsub_f32(v1072, v1392); + float32x2_t v1396 = vadd_f32(v1072, v1392); + float32x2_t v1623 = vsub_f32(v1074, v1620); + float32x2_t v1624 = vadd_f32(v1074, v1620); + v6[ostride * 11] = v1395; + v6[ostride * 27] = v1396; + v6[ostride * 15] = v1623; + v6[ostride * 31] = v1624; + v5 += 1 * idist; + v6 += 1 * odist; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cf32_ab_t_gs32(const armral_cmplx_f32_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, + const armral_cmplx_f32_t *restrict w, + int howmany, int idist, int odist, + float dir) { + int64_t v0 = istride; + int64_t v1 = idist; + int64_t v2 = ostride; + int64_t v3 = odist; + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + const float32x2_t *v7 = (const float32x2_t *)w; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * v1; + int64_t v12 = v10 * v3; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + int64_t v13 = j; + float v1167 = -1.9509032201612819e-01F; + float v1222 = 7.0710678118654757e-01F; + float v1234 = -7.0710678118654746e-01F; + 
float v1239 = -1.0000000000000000e+00F; + float v1289 = 5.5557023301960229e-01F; + float v1294 = 8.3146961230254524e-01F; + float v1301 = -9.8078528040323043e-01F; + float v1356 = 3.8268343236508984e-01F; + float v1361 = 9.2387953251128674e-01F; + float v1368 = -9.2387953251128685e-01F; + float v1373 = -3.8268343236508967e-01F; + float v1423 = 1.9509032201612833e-01F; + float v1428 = 9.8078528040323043e-01F; + float v1435 = -5.5557023301960218e-01F; + float v1440 = -8.3146961230254524e-01F; + const float32x2_t *v1630 = &v5[v0]; + float32x2_t *v1868 = &v6[v2]; + int64_t v19 = v0 * 16; + int64_t v34 = v10 * 15; + int64_t v40 = v0 * 8; + int64_t v48 = v10 * 7; + int64_t v54 = v0 * 24; + int64_t v62 = v10 * 23; + int64_t v68 = v0 * 4; + int64_t v82 = v0 * 20; + int64_t v97 = v10 * 3; + int64_t v104 = v10 * 19; + int64_t v110 = v0 * 12; + int64_t v124 = v0 * 28; + int64_t v139 = v10 * 11; + int64_t v146 = v10 * 27; + int64_t v152 = v0 * 2; + int64_t v166 = v0 * 18; + int64_t v188 = v10 * 17; + int64_t v194 = v0 * 10; + int64_t v202 = v10 * 9; + int64_t v208 = v0 * 26; + int64_t v216 = v10 * 25; + int64_t v222 = v0 * 6; + int64_t v236 = v0 * 22; + int64_t v251 = v10 * 5; + int64_t v258 = v10 * 21; + int64_t v264 = v0 * 14; + int64_t v272 = v10 * 13; + int64_t v278 = v0 * 30; + int64_t v286 = v10 * 29; + int64_t v306 = v0 * 17; + int64_t v328 = v10 * 16; + int64_t v334 = v0 * 9; + int64_t v342 = v10 * 8; + int64_t v348 = v0 * 25; + int64_t v356 = v10 * 24; + int64_t v362 = v0 * 5; + int64_t v376 = v0 * 21; + int64_t v391 = v10 * 4; + int64_t v398 = v10 * 20; + int64_t v404 = v0 * 13; + int64_t v418 = v0 * 29; + int64_t v433 = v10 * 12; + int64_t v440 = v10 * 28; + int64_t v446 = v0 * 3; + int64_t v460 = v0 * 19; + int64_t v475 = v10 * 2; + int64_t v482 = v10 * 18; + int64_t v488 = v0 * 11; + int64_t v496 = v10 * 10; + int64_t v502 = v0 * 27; + int64_t v510 = v10 * 26; + int64_t v516 = v0 * 7; + int64_t v530 = v0 * 23; + int64_t v545 = v10 * 6; + int64_t v552 = v10 * 22; + int64_t v558 = v0 * 15; + int64_t v572 = v0 * 31; + int64_t v587 = v10 * 14; + int64_t v594 = v10 * 30; + int64_t v595 = v13 * 31; + int64_t v1000 = v2 * 8; + int64_t v1007 = v2 * 16; + int64_t v1014 = v2 * 24; + int64_t v1067 = v2 * 9; + int64_t v1074 = v2 * 17; + int64_t v1081 = v2 * 25; + float v1096 = v4 * v1356; + int64_t v1127 = v2 * 2; + int64_t v1134 = v2 * 10; + int64_t v1141 = v2 * 18; + int64_t v1148 = v2 * 26; + float v1163 = v4 * v1289; + int64_t v1194 = v2 * 3; + int64_t v1201 = v2 * 11; + int64_t v1208 = v2 * 19; + int64_t v1215 = v2 * 27; + float v1242 = v4 * v1239; + int64_t v1261 = v2 * 4; + int64_t v1268 = v2 * 12; + int64_t v1275 = v2 * 20; + int64_t v1282 = v2 * 28; + float v1297 = v4 * v1294; + float v1309 = v4 * v1423; + int64_t v1328 = v2 * 5; + int64_t v1335 = v2 * 13; + int64_t v1342 = v2 * 21; + int64_t v1349 = v2 * 29; + float v1364 = v4 * v1361; + float v1376 = v4 * v1373; + int64_t v1395 = v2 * 6; + int64_t v1402 = v2 * 14; + int64_t v1409 = v2 * 22; + int64_t v1416 = v2 * 30; + float v1431 = v4 * v1428; + float v1443 = v4 * v1440; + int64_t v1462 = v2 * 7; + int64_t v1469 = v2 * 15; + int64_t v1476 = v2 * 23; + int64_t v1483 = v2 * 31; + const float32x2_t *v1777 = &v5[0]; + svint64_t v1778 = svindex_s64(0, v1); + float32x2_t *v1827 = &v6[0]; + svfloat32_t v1857 = svdup_n_f32(v1428); + svfloat32_t v1898 = svdup_n_f32(v1361); + svfloat32_t v1939 = svdup_n_f32(v1294); + svfloat32_t v1941 = svdup_n_f32(v1167); + svfloat32_t v1980 = svdup_n_f32(v1222); + svfloat32_t v1982 = svdup_n_f32(v1234); + 
svfloat32_t v2021 = svdup_n_f32(v1289); + svfloat32_t v2023 = svdup_n_f32(v1301); + svfloat32_t v2062 = svdup_n_f32(v1356); + svfloat32_t v2064 = svdup_n_f32(v1368); + svfloat32_t v2103 = svdup_n_f32(v1423); + svfloat32_t v2105 = svdup_n_f32(v1435); + svfloat32_t v2107 = svdup_n_f32(v4); + svint64_t v2142 = svindex_s64(0, v3); + int64_t v36 = v34 + v595; + int64_t v50 = v48 + v595; + int64_t v64 = v62 + v595; + int64_t v99 = v97 + v595; + int64_t v106 = v104 + v595; + int64_t v141 = v139 + v595; + int64_t v148 = v146 + v595; + int64_t v183 = v10 + v595; + int64_t v190 = v188 + v595; + int64_t v204 = v202 + v595; + int64_t v218 = v216 + v595; + int64_t v253 = v251 + v595; + int64_t v260 = v258 + v595; + int64_t v274 = v272 + v595; + int64_t v288 = v286 + v595; + svfloat32_t v324 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v595])); + int64_t v330 = v328 + v595; + int64_t v344 = v342 + v595; + int64_t v358 = v356 + v595; + int64_t v393 = v391 + v595; + int64_t v400 = v398 + v595; + int64_t v435 = v433 + v595; + int64_t v442 = v440 + v595; + int64_t v477 = v475 + v595; + int64_t v484 = v482 + v595; + int64_t v498 = v496 + v595; + int64_t v512 = v510 + v595; + int64_t v547 = v545 + v595; + int64_t v554 = v552 + v595; + int64_t v589 = v587 + v595; + int64_t v596 = v594 + v595; + const float32x2_t *v1495 = &v5[v19]; + const float32x2_t *v1504 = &v5[v40]; + const float32x2_t *v1513 = &v5[v54]; + const float32x2_t *v1522 = &v5[v68]; + const float32x2_t *v1531 = &v5[v82]; + const float32x2_t *v1540 = &v5[v110]; + const float32x2_t *v1549 = &v5[v124]; + const float32x2_t *v1558 = &v5[v152]; + const float32x2_t *v1567 = &v5[v166]; + const float32x2_t *v1576 = &v5[v194]; + const float32x2_t *v1585 = &v5[v208]; + const float32x2_t *v1594 = &v5[v222]; + const float32x2_t *v1603 = &v5[v236]; + const float32x2_t *v1612 = &v5[v264]; + const float32x2_t *v1621 = &v5[v278]; + svfloat32_t v1632 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1630), v1778)); + const float32x2_t *v1640 = &v5[v306]; + const float32x2_t *v1650 = &v5[v334]; + const float32x2_t *v1659 = &v5[v348]; + const float32x2_t *v1668 = &v5[v362]; + const float32x2_t *v1677 = &v5[v376]; + const float32x2_t *v1686 = &v5[v404]; + const float32x2_t *v1695 = &v5[v418]; + const float32x2_t *v1704 = &v5[v446]; + const float32x2_t *v1713 = &v5[v460]; + const float32x2_t *v1722 = &v5[v488]; + const float32x2_t *v1731 = &v5[v502]; + const float32x2_t *v1740 = &v5[v516]; + const float32x2_t *v1749 = &v5[v530]; + const float32x2_t *v1758 = &v5[v558]; + const float32x2_t *v1767 = &v5[v572]; + svfloat32_t v1779 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1777), v1778)); + float32x2_t *v1836 = &v6[v1000]; + float32x2_t *v1845 = &v6[v1007]; + float32x2_t *v1854 = &v6[v1014]; + float32x2_t *v1877 = &v6[v1067]; + float32x2_t *v1886 = &v6[v1074]; + float32x2_t *v1895 = &v6[v1081]; + svfloat32_t v1899 = svdup_n_f32(v1096); + float32x2_t *v1909 = &v6[v1127]; + float32x2_t *v1918 = &v6[v1134]; + float32x2_t *v1927 = &v6[v1141]; + float32x2_t *v1936 = &v6[v1148]; + svfloat32_t v1940 = svdup_n_f32(v1163); + float32x2_t *v1950 = &v6[v1194]; + float32x2_t *v1959 = &v6[v1201]; + float32x2_t *v1968 = &v6[v1208]; + float32x2_t *v1977 = &v6[v1215]; + svfloat32_t v1983 = svdup_n_f32(v1242); + float32x2_t *v1991 = &v6[v1261]; + float32x2_t *v2000 = &v6[v1268]; + float32x2_t *v2009 = &v6[v1275]; + float32x2_t *v2018 = &v6[v1282]; + svfloat32_t v2022 = svdup_n_f32(v1297); + 
svfloat32_t v2024 = svdup_n_f32(v1309); + float32x2_t *v2032 = &v6[v1328]; + float32x2_t *v2041 = &v6[v1335]; + float32x2_t *v2050 = &v6[v1342]; + float32x2_t *v2059 = &v6[v1349]; + svfloat32_t v2063 = svdup_n_f32(v1364); + svfloat32_t v2065 = svdup_n_f32(v1376); + float32x2_t *v2073 = &v6[v1395]; + float32x2_t *v2082 = &v6[v1402]; + float32x2_t *v2091 = &v6[v1409]; + float32x2_t *v2100 = &v6[v1416]; + svfloat32_t v2104 = svdup_n_f32(v1431); + svfloat32_t v2106 = svdup_n_f32(v1443); + float32x2_t *v2114 = &v6[v1462]; + float32x2_t *v2123 = &v6[v1469]; + float32x2_t *v2132 = &v6[v1476]; + float32x2_t *v2141 = &v6[v1483]; + svfloat32_t v37 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v36])); + svfloat32_t v51 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v50])); + svfloat32_t v65 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v64])); + svfloat32_t v100 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v99])); + svfloat32_t v107 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v106])); + svfloat32_t v142 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v141])); + svfloat32_t v149 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v148])); + svfloat32_t v184 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v183])); + svfloat32_t v191 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v190])); + svfloat32_t v205 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v204])); + svfloat32_t v219 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v218])); + svfloat32_t v254 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v253])); + svfloat32_t v261 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v260])); + svfloat32_t v275 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v274])); + svfloat32_t v289 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v288])); + svfloat32_t zero325 = svdup_n_f32(0); + svfloat32_t v325 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero325, v1632, v324, 0), v1632, + v324, 90); + svfloat32_t v331 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v330])); + svfloat32_t v345 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v344])); + svfloat32_t v359 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v358])); + svfloat32_t v394 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v393])); + svfloat32_t v401 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v400])); + svfloat32_t v436 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v435])); + svfloat32_t v443 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v442])); + svfloat32_t v478 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v477])); + svfloat32_t v485 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v484])); + svfloat32_t v499 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v498])); + svfloat32_t v513 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v512])); + svfloat32_t v548 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v547])); + svfloat32_t v555 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v554])); + svfloat32_t v590 = 
svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v589])); + svfloat32_t v597 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v596])); + svfloat32_t v1497 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1495), v1778)); + svfloat32_t v1506 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1504), v1778)); + svfloat32_t v1515 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1513), v1778)); + svfloat32_t v1524 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1522), v1778)); + svfloat32_t v1533 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1531), v1778)); + svfloat32_t v1542 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1540), v1778)); + svfloat32_t v1551 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1549), v1778)); + svfloat32_t v1560 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1558), v1778)); + svfloat32_t v1569 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1567), v1778)); + svfloat32_t v1578 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1576), v1778)); + svfloat32_t v1587 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1585), v1778)); + svfloat32_t v1596 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1594), v1778)); + svfloat32_t v1605 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1603), v1778)); + svfloat32_t v1614 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1612), v1778)); + svfloat32_t v1623 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1621), v1778)); + svfloat32_t v1642 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1640), v1778)); + svfloat32_t v1652 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1650), v1778)); + svfloat32_t v1661 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1659), v1778)); + svfloat32_t v1670 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1668), v1778)); + svfloat32_t v1679 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1677), v1778)); + svfloat32_t v1688 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1686), v1778)); + svfloat32_t v1697 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1695), v1778)); + svfloat32_t v1706 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1704), v1778)); + svfloat32_t v1715 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1713), v1778)); + svfloat32_t v1724 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1722), v1778)); + svfloat32_t v1733 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1731), v1778)); + svfloat32_t v1742 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1740), v1778)); + svfloat32_t v1751 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1749), v1778)); + svfloat32_t v1760 = svreinterpret_f32_f64( + 
svld1_gather_s64index_f64(pred_full, (const double *)(v1758), v1778)); + svfloat32_t v1769 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1767), v1778)); + svfloat32_t zero38 = svdup_n_f32(0); + svfloat32_t v38 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero38, v1497, v37, 0), + v1497, v37, 90); + svfloat32_t zero52 = svdup_n_f32(0); + svfloat32_t v52 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero52, v1506, v51, 0), + v1506, v51, 90); + svfloat32_t zero66 = svdup_n_f32(0); + svfloat32_t v66 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero66, v1515, v65, 0), + v1515, v65, 90); + svfloat32_t zero101 = svdup_n_f32(0); + svfloat32_t v101 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero101, v1524, v100, 0), v1524, + v100, 90); + svfloat32_t zero108 = svdup_n_f32(0); + svfloat32_t v108 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero108, v1533, v107, 0), v1533, + v107, 90); + svfloat32_t zero143 = svdup_n_f32(0); + svfloat32_t v143 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero143, v1542, v142, 0), v1542, + v142, 90); + svfloat32_t zero150 = svdup_n_f32(0); + svfloat32_t v150 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero150, v1551, v149, 0), v1551, + v149, 90); + svfloat32_t zero185 = svdup_n_f32(0); + svfloat32_t v185 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero185, v1560, v184, 0), v1560, + v184, 90); + svfloat32_t zero192 = svdup_n_f32(0); + svfloat32_t v192 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero192, v1569, v191, 0), v1569, + v191, 90); + svfloat32_t zero206 = svdup_n_f32(0); + svfloat32_t v206 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero206, v1578, v205, 0), v1578, + v205, 90); + svfloat32_t zero220 = svdup_n_f32(0); + svfloat32_t v220 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero220, v1587, v219, 0), v1587, + v219, 90); + svfloat32_t zero255 = svdup_n_f32(0); + svfloat32_t v255 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero255, v1596, v254, 0), v1596, + v254, 90); + svfloat32_t zero262 = svdup_n_f32(0); + svfloat32_t v262 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero262, v1605, v261, 0), v1605, + v261, 90); + svfloat32_t zero276 = svdup_n_f32(0); + svfloat32_t v276 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero276, v1614, v275, 0), v1614, + v275, 90); + svfloat32_t zero290 = svdup_n_f32(0); + svfloat32_t v290 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero290, v1623, v289, 0), v1623, + v289, 90); + svfloat32_t zero332 = svdup_n_f32(0); + svfloat32_t v332 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero332, v1642, v331, 0), v1642, + v331, 90); + svfloat32_t zero346 = svdup_n_f32(0); + svfloat32_t v346 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero346, v1652, v345, 0), v1652, + v345, 90); + svfloat32_t zero360 = svdup_n_f32(0); + svfloat32_t v360 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero360, v1661, v359, 0), v1661, + v359, 90); + svfloat32_t zero395 = svdup_n_f32(0); + svfloat32_t v395 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero395, v1670, v394, 0), v1670, + v394, 90); + svfloat32_t zero402 = svdup_n_f32(0); + svfloat32_t v402 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero402, v1679, v401, 0), v1679, + v401, 90); + svfloat32_t zero437 = svdup_n_f32(0); + svfloat32_t v437 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero437, v1688, v436, 0), v1688, + v436, 90); + svfloat32_t zero444 = svdup_n_f32(0); + svfloat32_t v444 = 
svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero444, v1697, v443, 0), v1697, + v443, 90); + svfloat32_t zero479 = svdup_n_f32(0); + svfloat32_t v479 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero479, v1706, v478, 0), v1706, + v478, 90); + svfloat32_t zero486 = svdup_n_f32(0); + svfloat32_t v486 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero486, v1715, v485, 0), v1715, + v485, 90); + svfloat32_t zero500 = svdup_n_f32(0); + svfloat32_t v500 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero500, v1724, v499, 0), v1724, + v499, 90); + svfloat32_t zero514 = svdup_n_f32(0); + svfloat32_t v514 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero514, v1733, v513, 0), v1733, + v513, 90); + svfloat32_t zero549 = svdup_n_f32(0); + svfloat32_t v549 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero549, v1742, v548, 0), v1742, + v548, 90); + svfloat32_t zero556 = svdup_n_f32(0); + svfloat32_t v556 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero556, v1751, v555, 0), v1751, + v555, 90); + svfloat32_t zero591 = svdup_n_f32(0); + svfloat32_t v591 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero591, v1760, v590, 0), v1760, + v590, 90); + svfloat32_t zero598 = svdup_n_f32(0); + svfloat32_t v598 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero598, v1769, v597, 0), v1769, + v597, 90); + svfloat32_t v606 = svadd_f32_x(svptrue_b32(), v1779, v38); + svfloat32_t v607 = svsub_f32_x(svptrue_b32(), v1779, v38); + svfloat32_t v608 = svadd_f32_x(svptrue_b32(), v52, v66); + svfloat32_t v609 = svsub_f32_x(svptrue_b32(), v52, v66); + svfloat32_t v621 = svadd_f32_x(svptrue_b32(), v101, v108); + svfloat32_t v622 = svsub_f32_x(svptrue_b32(), v101, v108); + svfloat32_t v623 = svadd_f32_x(svptrue_b32(), v143, v150); + svfloat32_t v624 = svsub_f32_x(svptrue_b32(), v143, v150); + svfloat32_t v677 = svadd_f32_x(svptrue_b32(), v185, v192); + svfloat32_t v678 = svsub_f32_x(svptrue_b32(), v185, v192); + svfloat32_t v679 = svadd_f32_x(svptrue_b32(), v206, v220); + svfloat32_t v680 = svsub_f32_x(svptrue_b32(), v206, v220); + svfloat32_t v692 = svadd_f32_x(svptrue_b32(), v255, v262); + svfloat32_t v693 = svsub_f32_x(svptrue_b32(), v255, v262); + svfloat32_t v694 = svadd_f32_x(svptrue_b32(), v276, v290); + svfloat32_t v695 = svsub_f32_x(svptrue_b32(), v276, v290); + svfloat32_t v837 = svadd_f32_x(svptrue_b32(), v325, v332); + svfloat32_t v838 = svsub_f32_x(svptrue_b32(), v325, v332); + svfloat32_t v839 = svadd_f32_x(svptrue_b32(), v346, v360); + svfloat32_t v840 = svsub_f32_x(svptrue_b32(), v346, v360); + svfloat32_t v852 = svadd_f32_x(svptrue_b32(), v395, v402); + svfloat32_t v853 = svsub_f32_x(svptrue_b32(), v395, v402); + svfloat32_t v854 = svadd_f32_x(svptrue_b32(), v437, v444); + svfloat32_t v855 = svsub_f32_x(svptrue_b32(), v437, v444); + svfloat32_t v908 = svadd_f32_x(svptrue_b32(), v479, v486); + svfloat32_t v909 = svsub_f32_x(svptrue_b32(), v479, v486); + svfloat32_t v910 = svadd_f32_x(svptrue_b32(), v500, v514); + svfloat32_t v911 = svsub_f32_x(svptrue_b32(), v500, v514); + svfloat32_t v923 = svadd_f32_x(svptrue_b32(), v549, v556); + svfloat32_t v924 = svsub_f32_x(svptrue_b32(), v549, v556); + svfloat32_t v925 = svadd_f32_x(svptrue_b32(), v591, v598); + svfloat32_t v926 = svsub_f32_x(svptrue_b32(), v591, v598); + svfloat32_t zero616 = svdup_n_f32(0); + svfloat32_t v616 = svcmla_f32_x(pred_full, zero616, v1983, v609, 90); + svfloat32_t v617 = svadd_f32_x(svptrue_b32(), v606, v608); + svfloat32_t v618 = svsub_f32_x(svptrue_b32(), v606, v608); + 
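+    // Butterfly stage: the svadd/svsub pairs combine the complex-multiplied
+    // inputs, while svcmla_f32_x with rotation #90 against a broadcast real
+    // coefficient applies a multiplication by a purely imaginary factor (the
+    // #0/#90 svcmla pair used on the loads above is a full complex multiply).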
svfloat32_t v625 = svadd_f32_x(svptrue_b32(), v621, v623); + svfloat32_t v626 = svsub_f32_x(svptrue_b32(), v621, v623); + svfloat32_t v642 = svmul_f32_x(svptrue_b32(), v622, v1980); + svfloat32_t v654 = svmul_f32_x(svptrue_b32(), v624, v1982); + svfloat32_t zero687 = svdup_n_f32(0); + svfloat32_t v687 = svcmla_f32_x(pred_full, zero687, v1983, v680, 90); + svfloat32_t v688 = svadd_f32_x(svptrue_b32(), v677, v679); + svfloat32_t v689 = svsub_f32_x(svptrue_b32(), v677, v679); + svfloat32_t zero702 = svdup_n_f32(0); + svfloat32_t v702 = svcmla_f32_x(pred_full, zero702, v1983, v695, 90); + svfloat32_t v703 = svadd_f32_x(svptrue_b32(), v692, v694); + svfloat32_t v704 = svsub_f32_x(svptrue_b32(), v692, v694); + svfloat32_t zero847 = svdup_n_f32(0); + svfloat32_t v847 = svcmla_f32_x(pred_full, zero847, v1983, v840, 90); + svfloat32_t v848 = svadd_f32_x(svptrue_b32(), v837, v839); + svfloat32_t v849 = svsub_f32_x(svptrue_b32(), v837, v839); + svfloat32_t v856 = svadd_f32_x(svptrue_b32(), v852, v854); + svfloat32_t v857 = svsub_f32_x(svptrue_b32(), v852, v854); + svfloat32_t v873 = svmul_f32_x(svptrue_b32(), v853, v1980); + svfloat32_t v885 = svmul_f32_x(svptrue_b32(), v855, v1982); + svfloat32_t zero918 = svdup_n_f32(0); + svfloat32_t v918 = svcmla_f32_x(pred_full, zero918, v1983, v911, 90); + svfloat32_t v919 = svadd_f32_x(svptrue_b32(), v908, v910); + svfloat32_t v920 = svsub_f32_x(svptrue_b32(), v908, v910); + svfloat32_t v927 = svadd_f32_x(svptrue_b32(), v923, v925); + svfloat32_t v928 = svsub_f32_x(svptrue_b32(), v923, v925); + svfloat32_t v944 = svmul_f32_x(svptrue_b32(), v924, v1980); + svfloat32_t v956 = svmul_f32_x(svptrue_b32(), v926, v1982); + svfloat32_t v619 = svsub_f32_x(svptrue_b32(), v607, v616); + svfloat32_t v620 = svadd_f32_x(svptrue_b32(), v607, v616); + svfloat32_t zero633 = svdup_n_f32(0); + svfloat32_t v633 = svcmla_f32_x(pred_full, zero633, v1983, v626, 90); + svfloat32_t v634 = svadd_f32_x(svptrue_b32(), v617, v625); + svfloat32_t v635 = svsub_f32_x(svptrue_b32(), v617, v625); + svfloat32_t v690 = svsub_f32_x(svptrue_b32(), v678, v687); + svfloat32_t v691 = svadd_f32_x(svptrue_b32(), v678, v687); + svfloat32_t v705 = svsub_f32_x(svptrue_b32(), v693, v702); + svfloat32_t v706 = svadd_f32_x(svptrue_b32(), v693, v702); + svfloat32_t v707 = svadd_f32_x(svptrue_b32(), v688, v703); + svfloat32_t v708 = svsub_f32_x(svptrue_b32(), v688, v703); + svfloat32_t v763 = svmul_f32_x(svptrue_b32(), v689, v1980); + svfloat32_t v775 = svmul_f32_x(svptrue_b32(), v704, v1982); + svfloat32_t v850 = svsub_f32_x(svptrue_b32(), v838, v847); + svfloat32_t v851 = svadd_f32_x(svptrue_b32(), v838, v847); + svfloat32_t zero864 = svdup_n_f32(0); + svfloat32_t v864 = svcmla_f32_x(pred_full, zero864, v1983, v857, 90); + svfloat32_t v865 = svadd_f32_x(svptrue_b32(), v848, v856); + svfloat32_t v866 = svsub_f32_x(svptrue_b32(), v848, v856); + svfloat32_t v921 = svsub_f32_x(svptrue_b32(), v909, v918); + svfloat32_t v922 = svadd_f32_x(svptrue_b32(), v909, v918); + svfloat32_t zero935 = svdup_n_f32(0); + svfloat32_t v935 = svcmla_f32_x(pred_full, zero935, v1983, v928, 90); + svfloat32_t v936 = svadd_f32_x(svptrue_b32(), v919, v927); + svfloat32_t v937 = svsub_f32_x(svptrue_b32(), v919, v927); + svfloat32_t v636 = svsub_f32_x(svptrue_b32(), v618, v633); + svfloat32_t v637 = svadd_f32_x(svptrue_b32(), v618, v633); + svfloat32_t v662 = svcmla_f32_x(pred_full, v642, v2107, v642, 90); + svfloat32_t v663 = svcmla_f32_x(pred_full, v654, v1983, v654, 90); + svfloat32_t zero715 = svdup_n_f32(0); + svfloat32_t v715 = 
svcmla_f32_x(pred_full, zero715, v1983, v708, 90); + svfloat32_t v716 = svadd_f32_x(svptrue_b32(), v634, v707); + svfloat32_t v717 = svsub_f32_x(svptrue_b32(), v634, v707); + svfloat32_t v724 = svmul_f32_x(svptrue_b32(), v690, v1898); + svfloat32_t v736 = svmul_f32_x(svptrue_b32(), v705, v2062); + svfloat32_t v802 = svmul_f32_x(svptrue_b32(), v691, v2062); + svfloat32_t v814 = svmul_f32_x(svptrue_b32(), v706, v2064); + svfloat32_t v867 = svsub_f32_x(svptrue_b32(), v849, v864); + svfloat32_t v868 = svadd_f32_x(svptrue_b32(), v849, v864); + svfloat32_t v893 = svcmla_f32_x(pred_full, v873, v2107, v873, 90); + svfloat32_t v894 = svcmla_f32_x(pred_full, v885, v1983, v885, 90); + svfloat32_t v938 = svsub_f32_x(svptrue_b32(), v920, v935); + svfloat32_t v939 = svadd_f32_x(svptrue_b32(), v920, v935); + svfloat32_t v964 = svcmla_f32_x(pred_full, v944, v2107, v944, 90); + svfloat32_t v965 = svcmla_f32_x(pred_full, v956, v1983, v956, 90); + svfloat32_t v979 = svadd_f32_x(svptrue_b32(), v865, v936); + svfloat32_t v980 = svsub_f32_x(svptrue_b32(), v865, v936); + svfloat32_t v1225 = svmul_f32_x(svptrue_b32(), v866, v1980); + svfloat32_t v1237 = svmul_f32_x(svptrue_b32(), v937, v1982); + svfloat32_t v664 = svadd_f32_x(svptrue_b32(), v662, v663); + svfloat32_t v665 = svsub_f32_x(svptrue_b32(), v663, v662); + svfloat32_t v718 = svsub_f32_x(svptrue_b32(), v635, v715); + svfloat32_t v719 = svadd_f32_x(svptrue_b32(), v635, v715); + svfloat32_t v744 = svcmla_f32_x(pred_full, v724, v1899, v690, 90); + svfloat32_t v745 = svcmla_f32_x(pred_full, v736, v2063, v705, 90); + svfloat32_t v783 = svcmla_f32_x(pred_full, v763, v2107, v763, 90); + svfloat32_t v784 = svcmla_f32_x(pred_full, v775, v1983, v775, 90); + svfloat32_t v822 = svcmla_f32_x(pred_full, v802, v2063, v691, 90); + svfloat32_t v823 = svcmla_f32_x(pred_full, v814, v2065, v706, 90); + svfloat32_t v895 = svadd_f32_x(svptrue_b32(), v893, v894); + svfloat32_t v896 = svsub_f32_x(svptrue_b32(), v894, v893); + svfloat32_t v966 = svadd_f32_x(svptrue_b32(), v964, v965); + svfloat32_t v967 = svsub_f32_x(svptrue_b32(), v965, v964); + svfloat32_t zero987 = svdup_n_f32(0); + svfloat32_t v987 = svcmla_f32_x(pred_full, zero987, v1983, v980, 90); + svfloat32_t v988 = svadd_f32_x(svptrue_b32(), v716, v979); + svfloat32_t v989 = svsub_f32_x(svptrue_b32(), v716, v979); + svfloat32_t v1091 = svmul_f32_x(svptrue_b32(), v867, v1898); + svfloat32_t v1103 = svmul_f32_x(svptrue_b32(), v938, v2062); + svfloat32_t v1359 = svmul_f32_x(svptrue_b32(), v868, v2062); + svfloat32_t v1371 = svmul_f32_x(svptrue_b32(), v939, v2064); + svfloat32_t zero672 = svdup_n_f32(0); + svfloat32_t v672 = svcmla_f32_x(pred_full, zero672, v2107, v665, 90); + svfloat32_t v673 = svadd_f32_x(svptrue_b32(), v619, v664); + svfloat32_t v674 = svsub_f32_x(svptrue_b32(), v619, v664); + svfloat32_t v746 = svadd_f32_x(svptrue_b32(), v744, v745); + svfloat32_t v747 = svsub_f32_x(svptrue_b32(), v745, v744); + svfloat32_t v785 = svadd_f32_x(svptrue_b32(), v783, v784); + svfloat32_t v786 = svsub_f32_x(svptrue_b32(), v784, v783); + svfloat32_t v824 = svadd_f32_x(svptrue_b32(), v822, v823); + svfloat32_t v825 = svsub_f32_x(svptrue_b32(), v823, v822); + svfloat32_t zero903 = svdup_n_f32(0); + svfloat32_t v903 = svcmla_f32_x(pred_full, zero903, v2107, v896, 90); + svfloat32_t v904 = svadd_f32_x(svptrue_b32(), v850, v895); + svfloat32_t v905 = svsub_f32_x(svptrue_b32(), v850, v895); + svfloat32_t zero974 = svdup_n_f32(0); + svfloat32_t v974 = svcmla_f32_x(pred_full, zero974, v2107, v967, 90); + svfloat32_t v975 = 
svadd_f32_x(svptrue_b32(), v921, v966); + svfloat32_t v976 = svsub_f32_x(svptrue_b32(), v921, v966); + svfloat32_t v990 = svsub_f32_x(svptrue_b32(), v717, v987); + svfloat32_t v991 = svadd_f32_x(svptrue_b32(), v717, v987); + svfloat32_t v1111 = svcmla_f32_x(pred_full, v1091, v1899, v867, 90); + svfloat32_t v1112 = svcmla_f32_x(pred_full, v1103, v2063, v938, 90); + svfloat32_t v1245 = svcmla_f32_x(pred_full, v1225, v2107, v1225, 90); + svfloat32_t v1246 = svcmla_f32_x(pred_full, v1237, v1983, v1237, 90); + svfloat32_t v1379 = svcmla_f32_x(pred_full, v1359, v2063, v868, 90); + svfloat32_t v1380 = svcmla_f32_x(pred_full, v1371, v2065, v939, 90); + svst1_scatter_s64index_f64(pred_full, (double *)(v1827), v2142, + svreinterpret_f64_f32(v988)); + svst1_scatter_s64index_f64(pred_full, (double *)(v1845), v2142, + svreinterpret_f64_f32(v989)); + svfloat32_t v675 = svsub_f32_x(svptrue_b32(), v620, v672); + svfloat32_t v676 = svadd_f32_x(svptrue_b32(), v620, v672); + svfloat32_t zero754 = svdup_n_f32(0); + svfloat32_t v754 = svcmla_f32_x(pred_full, zero754, v2107, v747, 90); + svfloat32_t v755 = svadd_f32_x(svptrue_b32(), v673, v746); + svfloat32_t v756 = svsub_f32_x(svptrue_b32(), v673, v746); + svfloat32_t zero793 = svdup_n_f32(0); + svfloat32_t v793 = svcmla_f32_x(pred_full, zero793, v2107, v786, 90); + svfloat32_t v794 = svadd_f32_x(svptrue_b32(), v636, v785); + svfloat32_t v795 = svsub_f32_x(svptrue_b32(), v636, v785); + svfloat32_t zero832 = svdup_n_f32(0); + svfloat32_t v832 = svcmla_f32_x(pred_full, zero832, v2107, v825, 90); + svfloat32_t v906 = svsub_f32_x(svptrue_b32(), v851, v903); + svfloat32_t v907 = svadd_f32_x(svptrue_b32(), v851, v903); + svfloat32_t v977 = svsub_f32_x(svptrue_b32(), v922, v974); + svfloat32_t v978 = svadd_f32_x(svptrue_b32(), v922, v974); + svfloat32_t v1024 = svmul_f32_x(svptrue_b32(), v904, v1857); + svfloat32_t v1036 = svmul_f32_x(svptrue_b32(), v975, v1939); + svfloat32_t v1113 = svadd_f32_x(svptrue_b32(), v1111, v1112); + svfloat32_t v1114 = svsub_f32_x(svptrue_b32(), v1112, v1111); + svfloat32_t v1247 = svadd_f32_x(svptrue_b32(), v1245, v1246); + svfloat32_t v1248 = svsub_f32_x(svptrue_b32(), v1246, v1245); + svfloat32_t v1292 = svmul_f32_x(svptrue_b32(), v905, v2021); + svfloat32_t v1304 = svmul_f32_x(svptrue_b32(), v976, v2023); + svfloat32_t v1381 = svadd_f32_x(svptrue_b32(), v1379, v1380); + svfloat32_t v1382 = svsub_f32_x(svptrue_b32(), v1380, v1379); + svst1_scatter_s64index_f64(pred_full, (double *)(v1836), v2142, + svreinterpret_f64_f32(v990)); + svst1_scatter_s64index_f64(pred_full, (double *)(v1854), v2142, + svreinterpret_f64_f32(v991)); + svfloat32_t v757 = svsub_f32_x(svptrue_b32(), v674, v754); + svfloat32_t v758 = svadd_f32_x(svptrue_b32(), v674, v754); + svfloat32_t v796 = svsub_f32_x(svptrue_b32(), v637, v793); + svfloat32_t v797 = svadd_f32_x(svptrue_b32(), v637, v793); + svfloat32_t v833 = svadd_f32_x(svptrue_b32(), v675, v824); + svfloat32_t v834 = svsub_f32_x(svptrue_b32(), v675, v824); + svfloat32_t v835 = svsub_f32_x(svptrue_b32(), v676, v832); + svfloat32_t v836 = svadd_f32_x(svptrue_b32(), v676, v832); + svfloat32_t v1044 = svcmla_f32_x(pred_full, v1024, v2024, v904, 90); + svfloat32_t v1045 = svcmla_f32_x(pred_full, v1036, v1940, v975, 90); + svfloat32_t zero1121 = svdup_n_f32(0); + svfloat32_t v1121 = svcmla_f32_x(pred_full, zero1121, v2107, v1114, 90); + svfloat32_t v1122 = svadd_f32_x(svptrue_b32(), v794, v1113); + svfloat32_t v1123 = svsub_f32_x(svptrue_b32(), v794, v1113); + svfloat32_t v1158 = svmul_f32_x(svptrue_b32(), v906, 
v1939); + svfloat32_t v1170 = svmul_f32_x(svptrue_b32(), v977, v1941); + svfloat32_t zero1255 = svdup_n_f32(0); + svfloat32_t v1255 = svcmla_f32_x(pred_full, zero1255, v2107, v1248, 90); + svfloat32_t v1256 = svadd_f32_x(svptrue_b32(), v718, v1247); + svfloat32_t v1257 = svsub_f32_x(svptrue_b32(), v718, v1247); + svfloat32_t v1312 = svcmla_f32_x(pred_full, v1292, v2022, v905, 90); + svfloat32_t v1313 = svcmla_f32_x(pred_full, v1304, v2024, v976, 90); + svfloat32_t zero1389 = svdup_n_f32(0); + svfloat32_t v1389 = svcmla_f32_x(pred_full, zero1389, v2107, v1382, 90); + svfloat32_t v1426 = svmul_f32_x(svptrue_b32(), v907, v2103); + svfloat32_t v1438 = svmul_f32_x(svptrue_b32(), v978, v2105); + svfloat32_t v1046 = svadd_f32_x(svptrue_b32(), v1044, v1045); + svfloat32_t v1047 = svsub_f32_x(svptrue_b32(), v1045, v1044); + svfloat32_t v1124 = svsub_f32_x(svptrue_b32(), v795, v1121); + svfloat32_t v1125 = svadd_f32_x(svptrue_b32(), v795, v1121); + svfloat32_t v1178 = svcmla_f32_x(pred_full, v1158, v1940, v906, 90); + svfloat32_t v1179 = svcmla_f32_x(pred_full, v1170, v2104, v977, 90); + svfloat32_t v1258 = svsub_f32_x(svptrue_b32(), v719, v1255); + svfloat32_t v1259 = svadd_f32_x(svptrue_b32(), v719, v1255); + svfloat32_t v1314 = svadd_f32_x(svptrue_b32(), v1312, v1313); + svfloat32_t v1315 = svsub_f32_x(svptrue_b32(), v1313, v1312); + svfloat32_t v1390 = svadd_f32_x(svptrue_b32(), v796, v1381); + svfloat32_t v1391 = svsub_f32_x(svptrue_b32(), v796, v1381); + svfloat32_t v1392 = svsub_f32_x(svptrue_b32(), v797, v1389); + svfloat32_t v1393 = svadd_f32_x(svptrue_b32(), v797, v1389); + svfloat32_t v1446 = svcmla_f32_x(pred_full, v1426, v2104, v907, 90); + svfloat32_t v1447 = svcmla_f32_x(pred_full, v1438, v2106, v978, 90); + svst1_scatter_s64index_f64(pred_full, (double *)(v1909), v2142, + svreinterpret_f64_f32(v1122)); + svst1_scatter_s64index_f64(pred_full, (double *)(v1927), v2142, + svreinterpret_f64_f32(v1123)); + svst1_scatter_s64index_f64(pred_full, (double *)(v1991), v2142, + svreinterpret_f64_f32(v1256)); + svst1_scatter_s64index_f64(pred_full, (double *)(v2009), v2142, + svreinterpret_f64_f32(v1257)); + svfloat32_t zero1054 = svdup_n_f32(0); + svfloat32_t v1054 = svcmla_f32_x(pred_full, zero1054, v2107, v1047, 90); + svfloat32_t v1055 = svadd_f32_x(svptrue_b32(), v755, v1046); + svfloat32_t v1056 = svsub_f32_x(svptrue_b32(), v755, v1046); + svfloat32_t v1180 = svadd_f32_x(svptrue_b32(), v1178, v1179); + svfloat32_t v1181 = svsub_f32_x(svptrue_b32(), v1179, v1178); + svfloat32_t zero1322 = svdup_n_f32(0); + svfloat32_t v1322 = svcmla_f32_x(pred_full, zero1322, v2107, v1315, 90); + svfloat32_t v1323 = svadd_f32_x(svptrue_b32(), v757, v1314); + svfloat32_t v1324 = svsub_f32_x(svptrue_b32(), v757, v1314); + svfloat32_t v1448 = svadd_f32_x(svptrue_b32(), v1446, v1447); + svfloat32_t v1449 = svsub_f32_x(svptrue_b32(), v1447, v1446); + svst1_scatter_s64index_f64(pred_full, (double *)(v1918), v2142, + svreinterpret_f64_f32(v1124)); + svst1_scatter_s64index_f64(pred_full, (double *)(v1936), v2142, + svreinterpret_f64_f32(v1125)); + svst1_scatter_s64index_f64(pred_full, (double *)(v2000), v2142, + svreinterpret_f64_f32(v1258)); + svst1_scatter_s64index_f64(pred_full, (double *)(v2018), v2142, + svreinterpret_f64_f32(v1259)); + svst1_scatter_s64index_f64(pred_full, (double *)(v2073), v2142, + svreinterpret_f64_f32(v1390)); + svst1_scatter_s64index_f64(pred_full, (double *)(v2082), v2142, + svreinterpret_f64_f32(v1392)); + svst1_scatter_s64index_f64(pred_full, (double *)(v2091), v2142, + 
svreinterpret_f64_f32(v1391)); + svst1_scatter_s64index_f64(pred_full, (double *)(v2100), v2142, + svreinterpret_f64_f32(v1393)); + svfloat32_t v1057 = svsub_f32_x(svptrue_b32(), v756, v1054); + svfloat32_t v1058 = svadd_f32_x(svptrue_b32(), v756, v1054); + svfloat32_t zero1188 = svdup_n_f32(0); + svfloat32_t v1188 = svcmla_f32_x(pred_full, zero1188, v2107, v1181, 90); + svfloat32_t v1189 = svadd_f32_x(svptrue_b32(), v833, v1180); + svfloat32_t v1190 = svsub_f32_x(svptrue_b32(), v833, v1180); + svfloat32_t v1325 = svsub_f32_x(svptrue_b32(), v758, v1322); + svfloat32_t v1326 = svadd_f32_x(svptrue_b32(), v758, v1322); + svfloat32_t zero1456 = svdup_n_f32(0); + svfloat32_t v1456 = svcmla_f32_x(pred_full, zero1456, v2107, v1449, 90); + svfloat32_t v1457 = svadd_f32_x(svptrue_b32(), v835, v1448); + svfloat32_t v1458 = svsub_f32_x(svptrue_b32(), v835, v1448); + svst1_scatter_s64index_f64(pred_full, (double *)(v1868), v2142, + svreinterpret_f64_f32(v1055)); + svst1_scatter_s64index_f64(pred_full, (double *)(v1886), v2142, + svreinterpret_f64_f32(v1056)); + svst1_scatter_s64index_f64(pred_full, (double *)(v2032), v2142, + svreinterpret_f64_f32(v1323)); + svst1_scatter_s64index_f64(pred_full, (double *)(v2050), v2142, + svreinterpret_f64_f32(v1324)); + svfloat32_t v1191 = svsub_f32_x(svptrue_b32(), v834, v1188); + svfloat32_t v1192 = svadd_f32_x(svptrue_b32(), v834, v1188); + svfloat32_t v1459 = svsub_f32_x(svptrue_b32(), v836, v1456); + svfloat32_t v1460 = svadd_f32_x(svptrue_b32(), v836, v1456); + svst1_scatter_s64index_f64(pred_full, (double *)(v1877), v2142, + svreinterpret_f64_f32(v1057)); + svst1_scatter_s64index_f64(pred_full, (double *)(v1895), v2142, + svreinterpret_f64_f32(v1058)); + svst1_scatter_s64index_f64(pred_full, (double *)(v1950), v2142, + svreinterpret_f64_f32(v1189)); + svst1_scatter_s64index_f64(pred_full, (double *)(v1968), v2142, + svreinterpret_f64_f32(v1190)); + svst1_scatter_s64index_f64(pred_full, (double *)(v2041), v2142, + svreinterpret_f64_f32(v1325)); + svst1_scatter_s64index_f64(pred_full, (double *)(v2059), v2142, + svreinterpret_f64_f32(v1326)); + svst1_scatter_s64index_f64(pred_full, (double *)(v2114), v2142, + svreinterpret_f64_f32(v1457)); + svst1_scatter_s64index_f64(pred_full, (double *)(v2132), v2142, + svreinterpret_f64_f32(v1458)); + svst1_scatter_s64index_f64(pred_full, (double *)(v1959), v2142, + svreinterpret_f64_f32(v1191)); + svst1_scatter_s64index_f64(pred_full, (double *)(v1977), v2142, + svreinterpret_f64_f32(v1192)); + svst1_scatter_s64index_f64(pred_full, (double *)(v2123), v2142, + svreinterpret_f64_f32(v1459)); + svst1_scatter_s64index_f64(pred_full, (double *)(v2141), v2142, + svreinterpret_f64_f32(v1460)); + v5 += v11; + v6 += v12; + } +} +#endif diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gs.h b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gs.h index b66da6b54073abdfe2c5d6ad044c99534a44988c..4e6080ac3ef6e4b81e7d24c33cd355f84062591a 100644 --- a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gs.h +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gs.h @@ -43,6 +43,7 @@ cf32_cf32_cf32_ab_t_gs_fft_t armral_fft_cf32_cf32_cf32_ab_t_gs21; cf32_cf32_cf32_ab_t_gs_fft_t armral_fft_cf32_cf32_cf32_ab_t_gs22; cf32_cf32_cf32_ab_t_gs_fft_t armral_fft_cf32_cf32_cf32_ab_t_gs24; cf32_cf32_cf32_ab_t_gs_fft_t armral_fft_cf32_cf32_cf32_ab_t_gs25; +cf32_cf32_cf32_ab_t_gs_fft_t armral_fft_cf32_cf32_cf32_ab_t_gs32; #ifdef __cplusplus } // extern "C" diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_gu.c b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_gu.c index 
da66b3f292873af7bbca7297a14bd4f942eea34b..4bd9bd8a95b3d89f97d1d438658ead90a99d988b 100644 --- a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_gu.c +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_gu.c @@ -11,6 +11,984 @@ #include #endif +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cf32_ac_n_gu7(const armral_cmplx_f32_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + int idist, float dir) { + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + for (int j = 0; j < howmany; j += 1) { + float32x2_t v20 = v5[istride]; + float v73 = -1.1666666666666665e+00F; + float v77 = 7.9015646852540022e-01F; + float v81 = 5.5854267289647742e-02F; + float v85 = 7.3430220123575241e-01F; + float v88 = 4.4095855184409838e-01F; + float v89 = -4.4095855184409838e-01F; + float v95 = 3.4087293062393137e-01F; + float v96 = -3.4087293062393137e-01F; + float v102 = -5.3396936033772524e-01F; + float v103 = 5.3396936033772524e-01F; + float v109 = 8.7484229096165667e-01F; + float v110 = -8.7484229096165667e-01F; + float32x2_t v112 = (float32x2_t){v4, v4}; + float32x2_t v58 = v5[0]; + float32x2_t v74 = (float32x2_t){v73, v73}; + float32x2_t v78 = (float32x2_t){v77, v77}; + float32x2_t v82 = (float32x2_t){v81, v81}; + float32x2_t v86 = (float32x2_t){v85, v85}; + float32x2_t v90 = (float32x2_t){v88, v89}; + float32x2_t v97 = (float32x2_t){v95, v96}; + float32x2_t v104 = (float32x2_t){v102, v103}; + float32x2_t v111 = (float32x2_t){v109, v110}; + float32x2_t v25 = v5[istride * 6]; + float32x2_t v32 = v5[istride * 4]; + float32x2_t v37 = v5[istride * 3]; + float32x2_t v44 = v5[istride * 2]; + float32x2_t v49 = v5[istride * 5]; + float32x2_t v92 = vmul_f32(v112, v90); + float32x2_t v99 = vmul_f32(v112, v97); + float32x2_t v106 = vmul_f32(v112, v104); + float32x2_t v113 = vmul_f32(v112, v111); + float32x2_t v26 = vadd_f32(v20, v25); + float32x2_t v27 = vsub_f32(v20, v25); + float32x2_t v38 = vadd_f32(v32, v37); + float32x2_t v39 = vsub_f32(v32, v37); + float32x2_t v50 = vadd_f32(v44, v49); + float32x2_t v51 = vsub_f32(v44, v49); + float32x2_t v52 = vadd_f32(v26, v38); + float32x2_t v60 = vsub_f32(v26, v38); + float32x2_t v61 = vsub_f32(v38, v50); + float32x2_t v62 = vsub_f32(v50, v26); + float32x2_t v63 = vadd_f32(v27, v39); + float32x2_t v65 = vsub_f32(v27, v39); + float32x2_t v66 = vsub_f32(v39, v51); + float32x2_t v67 = vsub_f32(v51, v27); + float32x2_t v53 = vadd_f32(v52, v50); + float32x2_t v64 = vadd_f32(v63, v51); + float32x2_t v79 = vmul_f32(v60, v78); + float32x2_t v83 = vmul_f32(v61, v82); + float32x2_t v87 = vmul_f32(v62, v86); + float32x2_t v100 = vrev64_f32(v65); + float32x2_t v107 = vrev64_f32(v66); + float32x2_t v114 = vrev64_f32(v67); + float32x2_t v59 = vadd_f32(v53, v58); + float32x2_t v75 = vmul_f32(v53, v74); + float32x2_t v93 = vrev64_f32(v64); + float32x2_t v101 = vmul_f32(v100, v99); + float32x2_t v108 = vmul_f32(v107, v106); + float32x2_t v115 = vmul_f32(v114, v113); + float32x2_t v94 = vmul_f32(v93, v92); + float32x2_t v116 = vadd_f32(v59, v75); + v6[0] = v59; + float32x2_t v117 = vadd_f32(v116, v79); + float32x2_t v119 = vsub_f32(v116, v79); + float32x2_t v121 = vsub_f32(v116, v83); + float32x2_t v123 = vadd_f32(v94, v101); + float32x2_t v125 = vsub_f32(v94, v101); + float32x2_t v127 = vsub_f32(v94, v108); + float32x2_t v118 = vadd_f32(v117, v83); + float32x2_t v120 = vsub_f32(v119, v87); + float32x2_t v122 = vadd_f32(v121, v87); + float32x2_t v124 = vadd_f32(v123, v108); + float32x2_t v126 = vsub_f32(v125, 
v115); + float32x2_t v128 = vadd_f32(v127, v115); + float32x2_t v129 = vadd_f32(v118, v124); + float32x2_t v130 = vsub_f32(v118, v124); + float32x2_t v131 = vadd_f32(v120, v126); + float32x2_t v132 = vsub_f32(v120, v126); + float32x2_t v133 = vadd_f32(v122, v128); + float32x2_t v134 = vsub_f32(v122, v128); + v6[ostride] = v130; + v6[ostride * 2] = v132; + v6[ostride * 3] = v133; + v6[ostride * 4] = v134; + v6[ostride * 5] = v131; + v6[ostride * 6] = v129; + v5 += 1 * idist; + v6 += 1 * 1; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cf32_ac_n_gu7(const armral_cmplx_f32_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + int idist, float dir) { + int64_t v0 = istride; + int64_t v1 = idist; + int64_t v2 = ostride; + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * v1; + int64_t v12 = v10 * 1; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + float v90 = -1.1666666666666665e+00F; + float v95 = 7.9015646852540022e-01F; + float v100 = 5.5854267289647742e-02F; + float v105 = 7.3430220123575241e-01F; + float v110 = -4.4095855184409838e-01F; + float v117 = -3.4087293062393137e-01F; + float v124 = 5.3396936033772524e-01F; + float v131 = -8.7484229096165667e-01F; + const float32x2_t *v211 = &v5[v0]; + float32x2_t *v294 = &v6[v2]; + int64_t v26 = v0 * 6; + int64_t v35 = v0 * 4; + int64_t v42 = v0 * 3; + int64_t v51 = v0 * 2; + int64_t v58 = v0 * 5; + float v113 = v4 * v110; + float v120 = v4 * v117; + float v127 = v4 * v124; + float v134 = v4 * v131; + int64_t v171 = v2 * 2; + int64_t v178 = v2 * 3; + int64_t v185 = v2 * 4; + int64_t v192 = v2 * 5; + int64_t v199 = v2 * 6; + const float32x2_t *v266 = &v5[0]; + svint64_t v267 = svindex_s64(0, v1); + svfloat32_t v270 = svdup_n_f32(v90); + svfloat32_t v271 = svdup_n_f32(v95); + svfloat32_t v272 = svdup_n_f32(v100); + svfloat32_t v273 = svdup_n_f32(v105); + float32x2_t *v285 = &v6[0]; + svfloat32_t v213 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v211), v267)); + const float32x2_t *v220 = &v5[v26]; + const float32x2_t *v229 = &v5[v35]; + const float32x2_t *v238 = &v5[v42]; + const float32x2_t *v247 = &v5[v51]; + const float32x2_t *v256 = &v5[v58]; + svfloat32_t v268 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v266), v267)); + svfloat32_t v274 = svdup_n_f32(v113); + svfloat32_t v275 = svdup_n_f32(v120); + svfloat32_t v276 = svdup_n_f32(v127); + svfloat32_t v277 = svdup_n_f32(v134); + float32x2_t *v303 = &v6[v171]; + float32x2_t *v312 = &v6[v178]; + float32x2_t *v321 = &v6[v185]; + float32x2_t *v330 = &v6[v192]; + float32x2_t *v339 = &v6[v199]; + svfloat32_t v222 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v220), v267)); + svfloat32_t v231 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v229), v267)); + svfloat32_t v240 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v238), v267)); + svfloat32_t v249 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v247), v267)); + svfloat32_t v258 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v256), v267)); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v213, v222); + svfloat32_t v33 = svsub_f32_x(svptrue_b32(), v213, v222); + svfloat32_t v48 = 
svadd_f32_x(svptrue_b32(), v231, v240); + svfloat32_t v49 = svsub_f32_x(svptrue_b32(), v231, v240); + svfloat32_t v64 = svadd_f32_x(svptrue_b32(), v249, v258); + svfloat32_t v65 = svsub_f32_x(svptrue_b32(), v249, v258); + svfloat32_t v66 = svadd_f32_x(svptrue_b32(), v32, v48); + svfloat32_t v76 = svsub_f32_x(svptrue_b32(), v32, v48); + svfloat32_t v77 = svsub_f32_x(svptrue_b32(), v48, v64); + svfloat32_t v78 = svsub_f32_x(svptrue_b32(), v64, v32); + svfloat32_t v79 = svadd_f32_x(svptrue_b32(), v33, v49); + svfloat32_t v81 = svsub_f32_x(svptrue_b32(), v33, v49); + svfloat32_t v82 = svsub_f32_x(svptrue_b32(), v49, v65); + svfloat32_t v83 = svsub_f32_x(svptrue_b32(), v65, v33); + svfloat32_t v67 = svadd_f32_x(svptrue_b32(), v66, v64); + svfloat32_t v80 = svadd_f32_x(svptrue_b32(), v79, v65); + svfloat32_t zero122 = svdup_n_f32(0); + svfloat32_t v122 = svcmla_f32_x(pred_full, zero122, v275, v81, 90); + svfloat32_t zero129 = svdup_n_f32(0); + svfloat32_t v129 = svcmla_f32_x(pred_full, zero129, v276, v82, 90); + svfloat32_t zero136 = svdup_n_f32(0); + svfloat32_t v136 = svcmla_f32_x(pred_full, zero136, v277, v83, 90); + svfloat32_t v75 = svadd_f32_x(svptrue_b32(), v67, v268); + svfloat32_t zero115 = svdup_n_f32(0); + svfloat32_t v115 = svcmla_f32_x(pred_full, zero115, v274, v80, 90); + svfloat32_t v137 = svmla_f32_x(pred_full, v75, v67, v270); + svfloat32_t v144 = svadd_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v146 = svsub_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v148 = svsub_f32_x(svptrue_b32(), v115, v129); + svst1_f64(pred_full, (double *)(v285), svreinterpret_f64_f32(v75)); + svfloat32_t v138 = svmla_f32_x(pred_full, v137, v76, v271); + svfloat32_t v140 = svmls_f32_x(pred_full, v137, v76, v271); + svfloat32_t v142 = svmls_f32_x(pred_full, v137, v77, v272); + svfloat32_t v145 = svadd_f32_x(svptrue_b32(), v144, v129); + svfloat32_t v147 = svsub_f32_x(svptrue_b32(), v146, v136); + svfloat32_t v149 = svadd_f32_x(svptrue_b32(), v148, v136); + svfloat32_t v139 = svmla_f32_x(pred_full, v138, v77, v272); + svfloat32_t v141 = svmls_f32_x(pred_full, v140, v78, v273); + svfloat32_t v143 = svmla_f32_x(pred_full, v142, v78, v273); + svfloat32_t v150 = svadd_f32_x(svptrue_b32(), v139, v145); + svfloat32_t v151 = svsub_f32_x(svptrue_b32(), v139, v145); + svfloat32_t v152 = svadd_f32_x(svptrue_b32(), v141, v147); + svfloat32_t v153 = svsub_f32_x(svptrue_b32(), v141, v147); + svfloat32_t v154 = svadd_f32_x(svptrue_b32(), v143, v149); + svfloat32_t v155 = svsub_f32_x(svptrue_b32(), v143, v149); + svst1_f64(pred_full, (double *)(v294), svreinterpret_f64_f32(v151)); + svst1_f64(pred_full, (double *)(v303), svreinterpret_f64_f32(v153)); + svst1_f64(pred_full, (double *)(v312), svreinterpret_f64_f32(v154)); + svst1_f64(pred_full, (double *)(v321), svreinterpret_f64_f32(v155)); + svst1_f64(pred_full, (double *)(v330), svreinterpret_f64_f32(v152)); + svst1_f64(pred_full, (double *)(v339), svreinterpret_f64_f32(v150)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cf32_ac_n_gu9(const armral_cmplx_f32_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + int idist, float dir) { + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + for (int j = 0; j < howmany; j += 1) { + float32x2_t v20 = v5[istride]; + float v86 = -5.0000000000000000e-01F; + float v97 = -1.4999999999999998e+00F; + float v100 = 8.6602540378443871e-01F; + float v101 = -8.6602540378443871e-01F; + 
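+    // Radix-9 butterfly coefficients: +/-cos and +/-sin of multiples of 2*pi/9,
+    // together with the radix-3 sub-transform constants -1/2, -3/2 and
+    // +/-sqrt(3)/2.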
float v108 = 7.6604444311897801e-01F; + float v112 = 9.3969262078590832e-01F; + float v116 = -1.7364817766693039e-01F; + float v119 = 6.4278760968653925e-01F; + float v120 = -6.4278760968653925e-01F; + float v126 = -3.4202014332566888e-01F; + float v127 = 3.4202014332566888e-01F; + float v133 = 9.8480775301220802e-01F; + float v134 = -9.8480775301220802e-01F; + float32x2_t v136 = (float32x2_t){v4, v4}; + float32x2_t v71 = v5[0]; + float32x2_t v87 = (float32x2_t){v86, v86}; + float32x2_t v98 = (float32x2_t){v97, v97}; + float32x2_t v102 = (float32x2_t){v100, v101}; + float32x2_t v109 = (float32x2_t){v108, v108}; + float32x2_t v113 = (float32x2_t){v112, v112}; + float32x2_t v117 = (float32x2_t){v116, v116}; + float32x2_t v121 = (float32x2_t){v119, v120}; + float32x2_t v128 = (float32x2_t){v126, v127}; + float32x2_t v135 = (float32x2_t){v133, v134}; + float32x2_t v25 = v5[istride * 8]; + float32x2_t v32 = v5[istride * 7]; + float32x2_t v37 = v5[istride * 2]; + float32x2_t v44 = v5[istride * 3]; + float32x2_t v49 = v5[istride * 6]; + float32x2_t v56 = v5[istride * 4]; + float32x2_t v61 = v5[istride * 5]; + float32x2_t v104 = vmul_f32(v136, v102); + float32x2_t v123 = vmul_f32(v136, v121); + float32x2_t v130 = vmul_f32(v136, v128); + float32x2_t v137 = vmul_f32(v136, v135); + float32x2_t v26 = vadd_f32(v20, v25); + float32x2_t v27 = vsub_f32(v20, v25); + float32x2_t v38 = vadd_f32(v32, v37); + float32x2_t v39 = vsub_f32(v32, v37); + float32x2_t v50 = vadd_f32(v44, v49); + float32x2_t v51 = vsub_f32(v44, v49); + float32x2_t v62 = vadd_f32(v56, v61); + float32x2_t v63 = vsub_f32(v56, v61); + float32x2_t v64 = vadd_f32(v26, v38); + float32x2_t v73 = vadd_f32(v27, v39); + float32x2_t v75 = vsub_f32(v26, v38); + float32x2_t v76 = vsub_f32(v38, v62); + float32x2_t v77 = vsub_f32(v62, v26); + float32x2_t v78 = vsub_f32(v27, v39); + float32x2_t v79 = vsub_f32(v39, v63); + float32x2_t v80 = vsub_f32(v63, v27); + float32x2_t v99 = vmul_f32(v50, v98); + float32x2_t v105 = vrev64_f32(v51); + float32x2_t v65 = vadd_f32(v64, v62); + float32x2_t v74 = vadd_f32(v73, v63); + float32x2_t v106 = vmul_f32(v105, v104); + float32x2_t v110 = vmul_f32(v75, v109); + float32x2_t v114 = vmul_f32(v76, v113); + float32x2_t v118 = vmul_f32(v77, v117); + float32x2_t v124 = vrev64_f32(v78); + float32x2_t v131 = vrev64_f32(v79); + float32x2_t v138 = vrev64_f32(v80); + float32x2_t v66 = vadd_f32(v65, v50); + float32x2_t v88 = vmul_f32(v65, v87); + float32x2_t v94 = vrev64_f32(v74); + float32x2_t v125 = vmul_f32(v124, v123); + float32x2_t v132 = vmul_f32(v131, v130); + float32x2_t v139 = vmul_f32(v138, v137); + float32x2_t v72 = vadd_f32(v66, v71); + float32x2_t v95 = vmul_f32(v94, v104); + float32x2_t v140 = vadd_f32(v88, v88); + float32x2_t v153 = vadd_f32(v106, v125); + float32x2_t v155 = vsub_f32(v106, v132); + float32x2_t v157 = vsub_f32(v106, v125); + float32x2_t v141 = vadd_f32(v140, v88); + float32x2_t v145 = vadd_f32(v72, v99); + float32x2_t v154 = vadd_f32(v153, v132); + float32x2_t v156 = vadd_f32(v155, v139); + float32x2_t v158 = vsub_f32(v157, v139); + v6[0] = v72; + float32x2_t v142 = vadd_f32(v72, v141); + float32x2_t v146 = vadd_f32(v145, v140); + float32x2_t v143 = vadd_f32(v142, v95); + float32x2_t v144 = vsub_f32(v142, v95); + float32x2_t v147 = vadd_f32(v146, v110); + float32x2_t v149 = vsub_f32(v146, v114); + float32x2_t v151 = vsub_f32(v146, v110); + float32x2_t v148 = vadd_f32(v147, v114); + float32x2_t v150 = vadd_f32(v149, v118); + float32x2_t v152 = vsub_f32(v151, v118); + v6[ostride * 3] = v144; + 
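+    // Output bins are written back with stride `ostride`; note that the stores
+    // are not emitted in ascending bin order.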
v6[ostride * 6] = v143; + float32x2_t v159 = vadd_f32(v148, v154); + float32x2_t v160 = vsub_f32(v148, v154); + float32x2_t v161 = vadd_f32(v150, v156); + float32x2_t v162 = vsub_f32(v150, v156); + float32x2_t v163 = vadd_f32(v152, v158); + float32x2_t v164 = vsub_f32(v152, v158); + v6[ostride] = v160; + v6[ostride * 2] = v161; + v6[ostride * 4] = v164; + v6[ostride * 5] = v163; + v6[ostride * 7] = v162; + v6[ostride * 8] = v159; + v5 += 1 * idist; + v6 += 1 * 1; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cf32_ac_n_gu9(const armral_cmplx_f32_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + int idist, float dir) { + int64_t v0 = istride; + int64_t v1 = idist; + int64_t v2 = ostride; + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * v1; + int64_t v12 = v10 * 1; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + float v107 = -5.0000000000000000e-01F; + float v119 = -1.4999999999999998e+00F; + float v124 = -8.6602540378443871e-01F; + float v131 = 7.6604444311897801e-01F; + float v136 = 9.3969262078590832e-01F; + float v141 = -1.7364817766693039e-01F; + float v146 = -6.4278760968653925e-01F; + float v153 = 3.4202014332566888e-01F; + float v160 = -9.8480775301220802e-01F; + const float32x2_t *v260 = &v5[v0]; + float32x2_t *v363 = &v6[v2]; + int64_t v26 = v0 * 8; + int64_t v35 = v0 * 7; + int64_t v42 = v0 * 2; + int64_t v51 = v0 * 3; + int64_t v58 = v0 * 6; + int64_t v67 = v0 * 4; + int64_t v74 = v0 * 5; + float v127 = v4 * v124; + float v149 = v4 * v146; + float v156 = v4 * v153; + float v163 = v4 * v160; + int64_t v206 = v2 * 2; + int64_t v213 = v2 * 3; + int64_t v220 = v2 * 4; + int64_t v227 = v2 * 5; + int64_t v234 = v2 * 6; + int64_t v241 = v2 * 7; + int64_t v248 = v2 * 8; + const float32x2_t *v333 = &v5[0]; + svint64_t v334 = svindex_s64(0, v1); + svfloat32_t v337 = svdup_n_f32(v107); + svfloat32_t v339 = svdup_n_f32(v119); + svfloat32_t v341 = svdup_n_f32(v131); + svfloat32_t v342 = svdup_n_f32(v136); + svfloat32_t v343 = svdup_n_f32(v141); + float32x2_t *v354 = &v6[0]; + svfloat32_t v262 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v260), v334)); + const float32x2_t *v269 = &v5[v26]; + const float32x2_t *v278 = &v5[v35]; + const float32x2_t *v287 = &v5[v42]; + const float32x2_t *v296 = &v5[v51]; + const float32x2_t *v305 = &v5[v58]; + const float32x2_t *v314 = &v5[v67]; + const float32x2_t *v323 = &v5[v74]; + svfloat32_t v335 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v333), v334)); + svfloat32_t v340 = svdup_n_f32(v127); + svfloat32_t v344 = svdup_n_f32(v149); + svfloat32_t v345 = svdup_n_f32(v156); + svfloat32_t v346 = svdup_n_f32(v163); + float32x2_t *v372 = &v6[v206]; + float32x2_t *v381 = &v6[v213]; + float32x2_t *v390 = &v6[v220]; + float32x2_t *v399 = &v6[v227]; + float32x2_t *v408 = &v6[v234]; + float32x2_t *v417 = &v6[v241]; + float32x2_t *v426 = &v6[v248]; + svfloat32_t v271 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v269), v334)); + svfloat32_t v280 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v278), v334)); + svfloat32_t v289 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v287), v334)); + svfloat32_t v298 = svreinterpret_f32_f64( + 
svld1_gather_s64index_f64(pred_full, (const double *)(v296), v334)); + svfloat32_t v307 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v305), v334)); + svfloat32_t v316 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v314), v334)); + svfloat32_t v325 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v323), v334)); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v262, v271); + svfloat32_t v33 = svsub_f32_x(svptrue_b32(), v262, v271); + svfloat32_t v48 = svadd_f32_x(svptrue_b32(), v280, v289); + svfloat32_t v49 = svsub_f32_x(svptrue_b32(), v280, v289); + svfloat32_t v64 = svadd_f32_x(svptrue_b32(), v298, v307); + svfloat32_t v65 = svsub_f32_x(svptrue_b32(), v298, v307); + svfloat32_t v80 = svadd_f32_x(svptrue_b32(), v316, v325); + svfloat32_t v81 = svsub_f32_x(svptrue_b32(), v316, v325); + svfloat32_t v82 = svadd_f32_x(svptrue_b32(), v32, v48); + svfloat32_t v93 = svadd_f32_x(svptrue_b32(), v33, v49); + svfloat32_t v95 = svsub_f32_x(svptrue_b32(), v32, v48); + svfloat32_t v96 = svsub_f32_x(svptrue_b32(), v48, v80); + svfloat32_t v97 = svsub_f32_x(svptrue_b32(), v80, v32); + svfloat32_t v98 = svsub_f32_x(svptrue_b32(), v33, v49); + svfloat32_t v99 = svsub_f32_x(svptrue_b32(), v49, v81); + svfloat32_t v100 = svsub_f32_x(svptrue_b32(), v81, v33); + svfloat32_t zero129 = svdup_n_f32(0); + svfloat32_t v129 = svcmla_f32_x(pred_full, zero129, v340, v65, 90); + svfloat32_t v83 = svadd_f32_x(svptrue_b32(), v82, v80); + svfloat32_t v94 = svadd_f32_x(svptrue_b32(), v93, v81); + svfloat32_t zero151 = svdup_n_f32(0); + svfloat32_t v151 = svcmla_f32_x(pred_full, zero151, v344, v98, 90); + svfloat32_t zero158 = svdup_n_f32(0); + svfloat32_t v158 = svcmla_f32_x(pred_full, zero158, v345, v99, 90); + svfloat32_t zero165 = svdup_n_f32(0); + svfloat32_t v165 = svcmla_f32_x(pred_full, zero165, v346, v100, 90); + svfloat32_t v84 = svadd_f32_x(svptrue_b32(), v83, v64); + svfloat32_t v110 = svmul_f32_x(svptrue_b32(), v83, v337); + svfloat32_t zero117 = svdup_n_f32(0); + svfloat32_t v117 = svcmla_f32_x(pred_full, zero117, v340, v94, 90); + svfloat32_t v179 = svadd_f32_x(svptrue_b32(), v129, v151); + svfloat32_t v181 = svsub_f32_x(svptrue_b32(), v129, v158); + svfloat32_t v183 = svsub_f32_x(svptrue_b32(), v129, v151); + svfloat32_t v92 = svadd_f32_x(svptrue_b32(), v84, v335); + svfloat32_t v166 = svadd_f32_x(svptrue_b32(), v110, v110); + svfloat32_t v180 = svadd_f32_x(svptrue_b32(), v179, v158); + svfloat32_t v182 = svadd_f32_x(svptrue_b32(), v181, v165); + svfloat32_t v184 = svsub_f32_x(svptrue_b32(), v183, v165); + svfloat32_t v167 = svmla_f32_x(pred_full, v166, v83, v337); + svfloat32_t v171 = svmla_f32_x(pred_full, v92, v64, v339); + svst1_f64(pred_full, (double *)(v354), svreinterpret_f64_f32(v92)); + svfloat32_t v168 = svadd_f32_x(svptrue_b32(), v92, v167); + svfloat32_t v172 = svadd_f32_x(svptrue_b32(), v171, v166); + svfloat32_t v169 = svadd_f32_x(svptrue_b32(), v168, v117); + svfloat32_t v170 = svsub_f32_x(svptrue_b32(), v168, v117); + svfloat32_t v173 = svmla_f32_x(pred_full, v172, v95, v341); + svfloat32_t v175 = svmls_f32_x(pred_full, v172, v96, v342); + svfloat32_t v177 = svmls_f32_x(pred_full, v172, v95, v341); + svfloat32_t v174 = svmla_f32_x(pred_full, v173, v96, v342); + svfloat32_t v176 = svmla_f32_x(pred_full, v175, v97, v343); + svfloat32_t v178 = svmls_f32_x(pred_full, v177, v97, v343); + svst1_f64(pred_full, (double *)(v381), svreinterpret_f64_f32(v170)); + svst1_f64(pred_full, (double *)(v408), 
svreinterpret_f64_f32(v169)); + svfloat32_t v185 = svadd_f32_x(svptrue_b32(), v174, v180); + svfloat32_t v186 = svsub_f32_x(svptrue_b32(), v174, v180); + svfloat32_t v187 = svadd_f32_x(svptrue_b32(), v176, v182); + svfloat32_t v188 = svsub_f32_x(svptrue_b32(), v176, v182); + svfloat32_t v189 = svadd_f32_x(svptrue_b32(), v178, v184); + svfloat32_t v190 = svsub_f32_x(svptrue_b32(), v178, v184); + svst1_f64(pred_full, (double *)(v363), svreinterpret_f64_f32(v186)); + svst1_f64(pred_full, (double *)(v372), svreinterpret_f64_f32(v187)); + svst1_f64(pred_full, (double *)(v390), svreinterpret_f64_f32(v190)); + svst1_f64(pred_full, (double *)(v399), svreinterpret_f64_f32(v189)); + svst1_f64(pred_full, (double *)(v417), svreinterpret_f64_f32(v188)); + svst1_f64(pred_full, (double *)(v426), svreinterpret_f64_f32(v185)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cf32_ac_n_gu11(const armral_cmplx_f32_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + int idist, float dir) { + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + for (int j = 0; j < howmany; j += 1) { + float32x2_t v20 = v5[istride]; + float v113 = 1.1000000000000001e+00F; + float v116 = 3.3166247903554003e-01F; + float v117 = -3.3166247903554003e-01F; + float v124 = 5.1541501300188641e-01F; + float v128 = 9.4125353283118118e-01F; + float v132 = 1.4143537075597825e+00F; + float v136 = 8.5949297361449750e-01F; + float v140 = 4.2314838273285138e-02F; + float v144 = 3.8639279888589606e-01F; + float v148 = 5.1254589567200015e-01F; + float v152 = 1.0702757469471715e+00F; + float v156 = 5.5486073394528512e-01F; + float v159 = 1.2412944743900585e+00F; + float v160 = -1.2412944743900585e+00F; + float v166 = 2.0897833842005756e-01F; + float v167 = -2.0897833842005756e-01F; + float v173 = 3.7415717312460811e-01F; + float v174 = -3.7415717312460811e-01F; + float v180 = 4.9929922194110327e-02F; + float v181 = -4.9929922194110327e-02F; + float v187 = 6.5815896284539266e-01F; + float v188 = -6.5815896284539266e-01F; + float v194 = 6.3306543373877577e-01F; + float v195 = -6.3306543373877577e-01F; + float v201 = 1.0822460581641109e+00F; + float v202 = -1.0822460581641109e+00F; + float v208 = 8.1720737907134022e-01F; + float v209 = -8.1720737907134022e-01F; + float v215 = 4.2408709531871824e-01F; + float v216 = -4.2408709531871824e-01F; + float32x2_t v218 = (float32x2_t){v4, v4}; + float32x2_t v86 = v5[0]; + float32x2_t v114 = (float32x2_t){v113, v113}; + float32x2_t v118 = (float32x2_t){v116, v117}; + float32x2_t v125 = (float32x2_t){v124, v124}; + float32x2_t v129 = (float32x2_t){v128, v128}; + float32x2_t v133 = (float32x2_t){v132, v132}; + float32x2_t v137 = (float32x2_t){v136, v136}; + float32x2_t v141 = (float32x2_t){v140, v140}; + float32x2_t v145 = (float32x2_t){v144, v144}; + float32x2_t v149 = (float32x2_t){v148, v148}; + float32x2_t v153 = (float32x2_t){v152, v152}; + float32x2_t v157 = (float32x2_t){v156, v156}; + float32x2_t v161 = (float32x2_t){v159, v160}; + float32x2_t v168 = (float32x2_t){v166, v167}; + float32x2_t v175 = (float32x2_t){v173, v174}; + float32x2_t v182 = (float32x2_t){v180, v181}; + float32x2_t v189 = (float32x2_t){v187, v188}; + float32x2_t v196 = (float32x2_t){v194, v195}; + float32x2_t v203 = (float32x2_t){v201, v202}; + float32x2_t v210 = (float32x2_t){v208, v209}; + float32x2_t v217 = (float32x2_t){v215, v216}; + float32x2_t v25 = v5[istride * 10]; + float32x2_t v31 = 
v5[istride * 2]; + float32x2_t v36 = v5[istride * 9]; + float32x2_t v42 = v5[istride * 3]; + float32x2_t v47 = v5[istride * 8]; + float32x2_t v53 = v5[istride * 4]; + float32x2_t v58 = v5[istride * 7]; + float32x2_t v64 = v5[istride * 5]; + float32x2_t v69 = v5[istride * 6]; + float32x2_t v120 = vmul_f32(v218, v118); + float32x2_t v163 = vmul_f32(v218, v161); + float32x2_t v170 = vmul_f32(v218, v168); + float32x2_t v177 = vmul_f32(v218, v175); + float32x2_t v184 = vmul_f32(v218, v182); + float32x2_t v191 = vmul_f32(v218, v189); + float32x2_t v198 = vmul_f32(v218, v196); + float32x2_t v205 = vmul_f32(v218, v203); + float32x2_t v212 = vmul_f32(v218, v210); + float32x2_t v219 = vmul_f32(v218, v217); + float32x2_t v26 = vadd_f32(v20, v25); + float32x2_t v37 = vadd_f32(v31, v36); + float32x2_t v48 = vadd_f32(v42, v47); + float32x2_t v59 = vadd_f32(v53, v58); + float32x2_t v70 = vadd_f32(v64, v69); + float32x2_t v71 = vsub_f32(v20, v25); + float32x2_t v72 = vsub_f32(v31, v36); + float32x2_t v73 = vsub_f32(v42, v47); + float32x2_t v74 = vsub_f32(v53, v58); + float32x2_t v75 = vsub_f32(v64, v69); + float32x2_t v76 = vadd_f32(v26, v37); + float32x2_t v77 = vadd_f32(v48, v70); + float32x2_t v79 = vsub_f32(v72, v73); + float32x2_t v80 = vadd_f32(v71, v75); + float32x2_t v90 = vsub_f32(v37, v59); + float32x2_t v91 = vsub_f32(v26, v59); + float32x2_t v92 = vsub_f32(v37, v26); + float32x2_t v93 = vsub_f32(v70, v59); + float32x2_t v94 = vsub_f32(v48, v59); + float32x2_t v95 = vsub_f32(v70, v48); + float32x2_t v96 = vsub_f32(v37, v70); + float32x2_t v97 = vsub_f32(v26, v48); + float32x2_t v99 = vadd_f32(v72, v74); + float32x2_t v100 = vsub_f32(v71, v74); + float32x2_t v101 = vadd_f32(v71, v72); + float32x2_t v102 = vsub_f32(v74, v75); + float32x2_t v103 = vsub_f32(v73, v74); + float32x2_t v104 = vsub_f32(v73, v75); + float32x2_t v105 = vadd_f32(v72, v75); + float32x2_t v106 = vsub_f32(v71, v73); + float32x2_t v78 = vadd_f32(v59, v76); + float32x2_t v88 = vsub_f32(v79, v80); + float32x2_t v98 = vsub_f32(v77, v76); + float32x2_t v107 = vadd_f32(v79, v80); + float32x2_t v126 = vmul_f32(v90, v125); + float32x2_t v130 = vmul_f32(v91, v129); + float32x2_t v134 = vmul_f32(v92, v133); + float32x2_t v138 = vmul_f32(v93, v137); + float32x2_t v142 = vmul_f32(v94, v141); + float32x2_t v146 = vmul_f32(v95, v145); + float32x2_t v150 = vmul_f32(v96, v149); + float32x2_t v154 = vmul_f32(v97, v153); + float32x2_t v164 = vrev64_f32(v99); + float32x2_t v171 = vrev64_f32(v100); + float32x2_t v178 = vrev64_f32(v101); + float32x2_t v185 = vrev64_f32(v102); + float32x2_t v192 = vrev64_f32(v103); + float32x2_t v199 = vrev64_f32(v104); + float32x2_t v206 = vrev64_f32(v105); + float32x2_t v213 = vrev64_f32(v106); + float32x2_t v81 = vadd_f32(v78, v77); + float32x2_t v89 = vsub_f32(v88, v74); + float32x2_t v158 = vmul_f32(v98, v157); + float32x2_t v165 = vmul_f32(v164, v163); + float32x2_t v172 = vmul_f32(v171, v170); + float32x2_t v179 = vmul_f32(v178, v177); + float32x2_t v186 = vmul_f32(v185, v184); + float32x2_t v193 = vmul_f32(v192, v191); + float32x2_t v200 = vmul_f32(v199, v198); + float32x2_t v207 = vmul_f32(v206, v205); + float32x2_t v214 = vmul_f32(v213, v212); + float32x2_t v220 = vrev64_f32(v107); + float32x2_t v223 = vadd_f32(v126, v130); + float32x2_t v224 = vadd_f32(v130, v134); + float32x2_t v225 = vsub_f32(v126, v134); + float32x2_t v226 = vadd_f32(v138, v142); + float32x2_t v227 = vadd_f32(v142, v146); + float32x2_t v228 = vsub_f32(v138, v146); + float32x2_t v87 = vadd_f32(v86, v81); + float32x2_t v115 = 
vmul_f32(v81, v114); + float32x2_t v121 = vrev64_f32(v89); + float32x2_t v221 = vmul_f32(v220, v219); + float32x2_t v229 = vadd_f32(v154, v158); + float32x2_t v230 = vadd_f32(v150, v158); + float32x2_t v231 = vadd_f32(v172, v179); + float32x2_t v232 = vsub_f32(v165, v179); + float32x2_t v233 = vadd_f32(v193, v200); + float32x2_t v234 = vsub_f32(v186, v200); + float32x2_t v122 = vmul_f32(v121, v120); + float32x2_t v222 = vsub_f32(v87, v115); + float32x2_t v235 = vadd_f32(v214, v221); + float32x2_t v236 = vsub_f32(v207, v221); + float32x2_t v237 = vadd_f32(v227, v229); + float32x2_t v255 = vadd_f32(v231, v232); + v6[0] = v87; + float32x2_t v238 = vadd_f32(v237, v222); + float32x2_t v239 = vsub_f32(v222, v224); + float32x2_t v241 = vadd_f32(v222, v228); + float32x2_t v243 = vsub_f32(v222, v225); + float32x2_t v245 = vadd_f32(v222, v223); + float32x2_t v247 = vadd_f32(v122, v233); + float32x2_t v249 = vsub_f32(v235, v231); + float32x2_t v251 = vadd_f32(v122, v236); + float32x2_t v253 = vsub_f32(v236, v232); + float32x2_t v256 = vadd_f32(v255, v233); + float32x2_t v240 = vsub_f32(v239, v229); + float32x2_t v242 = vadd_f32(v241, v230); + float32x2_t v244 = vsub_f32(v243, v230); + float32x2_t v246 = vsub_f32(v245, v226); + float32x2_t v248 = vadd_f32(v247, v235); + float32x2_t v250 = vsub_f32(v249, v122); + float32x2_t v252 = vadd_f32(v251, v234); + float32x2_t v254 = vsub_f32(v253, v122); + float32x2_t v257 = vadd_f32(v256, v234); + float32x2_t v258 = vsub_f32(v257, v122); + float32x2_t v260 = vadd_f32(v238, v248); + float32x2_t v261 = vadd_f32(v240, v250); + float32x2_t v262 = vsub_f32(v242, v252); + float32x2_t v263 = vadd_f32(v244, v254); + float32x2_t v264 = vsub_f32(v244, v254); + float32x2_t v265 = vadd_f32(v242, v252); + float32x2_t v266 = vsub_f32(v240, v250); + float32x2_t v267 = vsub_f32(v238, v248); + float32x2_t v259 = vadd_f32(v246, v258); + float32x2_t v268 = vsub_f32(v246, v258); + v6[ostride * 9] = v260; + v6[ostride * 8] = v261; + v6[ostride * 7] = v262; + v6[ostride * 6] = v263; + v6[ostride * 5] = v264; + v6[ostride * 4] = v265; + v6[ostride * 3] = v266; + v6[ostride * 2] = v267; + v6[ostride * 10] = v259; + v6[ostride] = v268; + v5 += 1 * idist; + v6 += 1 * 1; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cf32_ac_n_gu11(const armral_cmplx_f32_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + int idist, float dir) { + int64_t v0 = istride; + int64_t v1 = idist; + int64_t v2 = ostride; + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * v1; + int64_t v12 = v10 * 1; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + float v138 = 1.1000000000000001e+00F; + float v143 = -3.3166247903554003e-01F; + float v150 = 5.1541501300188641e-01F; + float v155 = 9.4125353283118118e-01F; + float v160 = 1.4143537075597825e+00F; + float v165 = 8.5949297361449750e-01F; + float v170 = 4.2314838273285138e-02F; + float v175 = 3.8639279888589606e-01F; + float v180 = 5.1254589567200015e-01F; + float v185 = 1.0702757469471715e+00F; + float v190 = 5.5486073394528512e-01F; + float v195 = -1.2412944743900585e+00F; + float v202 = -2.0897833842005756e-01F; + float v209 = -3.7415717312460811e-01F; + float v216 = -4.9929922194110327e-02F; + float v223 = -6.5815896284539266e-01F; + float v230 = -6.3306543373877577e-01F; + float v237 = -1.0822460581641109e+00F; + float v244 = 
-8.1720737907134022e-01F; + float v251 = -4.2408709531871824e-01F; + const float32x2_t *v387 = &v5[v0]; + float32x2_t *v599 = &v6[v2]; + int64_t v26 = v0 * 10; + int64_t v34 = v0 * 2; + int64_t v41 = v0 * 9; + int64_t v49 = v0 * 3; + int64_t v56 = v0 * 8; + int64_t v64 = v0 * 4; + int64_t v71 = v0 * 7; + int64_t v79 = v0 * 5; + int64_t v86 = v0 * 6; + float v146 = v4 * v143; + float v198 = v4 * v195; + float v205 = v4 * v202; + float v212 = v4 * v209; + float v219 = v4 * v216; + float v226 = v4 * v223; + float v233 = v4 * v230; + float v240 = v4 * v237; + float v247 = v4 * v244; + float v254 = v4 * v251; + int64_t v312 = v2 * 10; + int64_t v319 = v2 * 9; + int64_t v326 = v2 * 8; + int64_t v333 = v2 * 7; + int64_t v340 = v2 * 6; + int64_t v347 = v2 * 5; + int64_t v354 = v2 * 4; + int64_t v361 = v2 * 3; + int64_t v368 = v2 * 2; + const float32x2_t *v478 = &v5[0]; + svint64_t v479 = svindex_s64(0, v1); + svfloat32_t v482 = svdup_n_f32(v138); + svfloat32_t v484 = svdup_n_f32(v150); + svfloat32_t v485 = svdup_n_f32(v155); + svfloat32_t v486 = svdup_n_f32(v160); + svfloat32_t v487 = svdup_n_f32(v165); + svfloat32_t v488 = svdup_n_f32(v170); + svfloat32_t v489 = svdup_n_f32(v175); + svfloat32_t v490 = svdup_n_f32(v180); + svfloat32_t v491 = svdup_n_f32(v185); + svfloat32_t v492 = svdup_n_f32(v190); + float32x2_t *v509 = &v6[0]; + svfloat32_t v389 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v387), v479)); + const float32x2_t *v396 = &v5[v26]; + const float32x2_t *v405 = &v5[v34]; + const float32x2_t *v414 = &v5[v41]; + const float32x2_t *v423 = &v5[v49]; + const float32x2_t *v432 = &v5[v56]; + const float32x2_t *v441 = &v5[v64]; + const float32x2_t *v450 = &v5[v71]; + const float32x2_t *v459 = &v5[v79]; + const float32x2_t *v468 = &v5[v86]; + svfloat32_t v480 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v478), v479)); + svfloat32_t v483 = svdup_n_f32(v146); + svfloat32_t v493 = svdup_n_f32(v198); + svfloat32_t v494 = svdup_n_f32(v205); + svfloat32_t v495 = svdup_n_f32(v212); + svfloat32_t v496 = svdup_n_f32(v219); + svfloat32_t v497 = svdup_n_f32(v226); + svfloat32_t v498 = svdup_n_f32(v233); + svfloat32_t v499 = svdup_n_f32(v240); + svfloat32_t v500 = svdup_n_f32(v247); + svfloat32_t v501 = svdup_n_f32(v254); + float32x2_t *v518 = &v6[v312]; + float32x2_t *v527 = &v6[v319]; + float32x2_t *v536 = &v6[v326]; + float32x2_t *v545 = &v6[v333]; + float32x2_t *v554 = &v6[v340]; + float32x2_t *v563 = &v6[v347]; + float32x2_t *v572 = &v6[v354]; + float32x2_t *v581 = &v6[v361]; + float32x2_t *v590 = &v6[v368]; + svfloat32_t v398 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v396), v479)); + svfloat32_t v407 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v405), v479)); + svfloat32_t v416 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v414), v479)); + svfloat32_t v425 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v423), v479)); + svfloat32_t v434 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v432), v479)); + svfloat32_t v443 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v441), v479)); + svfloat32_t v452 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v450), v479)); + svfloat32_t v461 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v459), v479)); + svfloat32_t v470 = 
svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v468), v479)); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v389, v398); + svfloat32_t v47 = svadd_f32_x(svptrue_b32(), v407, v416); + svfloat32_t v62 = svadd_f32_x(svptrue_b32(), v425, v434); + svfloat32_t v77 = svadd_f32_x(svptrue_b32(), v443, v452); + svfloat32_t v92 = svadd_f32_x(svptrue_b32(), v461, v470); + svfloat32_t v93 = svsub_f32_x(svptrue_b32(), v389, v398); + svfloat32_t v94 = svsub_f32_x(svptrue_b32(), v407, v416); + svfloat32_t v95 = svsub_f32_x(svptrue_b32(), v425, v434); + svfloat32_t v96 = svsub_f32_x(svptrue_b32(), v443, v452); + svfloat32_t v97 = svsub_f32_x(svptrue_b32(), v461, v470); + svfloat32_t v98 = svadd_f32_x(svptrue_b32(), v32, v47); + svfloat32_t v99 = svadd_f32_x(svptrue_b32(), v62, v92); + svfloat32_t v101 = svsub_f32_x(svptrue_b32(), v94, v95); + svfloat32_t v102 = svadd_f32_x(svptrue_b32(), v93, v97); + svfloat32_t v114 = svsub_f32_x(svptrue_b32(), v47, v77); + svfloat32_t v115 = svsub_f32_x(svptrue_b32(), v32, v77); + svfloat32_t v116 = svsub_f32_x(svptrue_b32(), v47, v32); + svfloat32_t v117 = svsub_f32_x(svptrue_b32(), v92, v77); + svfloat32_t v118 = svsub_f32_x(svptrue_b32(), v62, v77); + svfloat32_t v119 = svsub_f32_x(svptrue_b32(), v92, v62); + svfloat32_t v120 = svsub_f32_x(svptrue_b32(), v47, v92); + svfloat32_t v121 = svsub_f32_x(svptrue_b32(), v32, v62); + svfloat32_t v123 = svadd_f32_x(svptrue_b32(), v94, v96); + svfloat32_t v124 = svsub_f32_x(svptrue_b32(), v93, v96); + svfloat32_t v125 = svadd_f32_x(svptrue_b32(), v93, v94); + svfloat32_t v126 = svsub_f32_x(svptrue_b32(), v96, v97); + svfloat32_t v127 = svsub_f32_x(svptrue_b32(), v95, v96); + svfloat32_t v128 = svsub_f32_x(svptrue_b32(), v95, v97); + svfloat32_t v129 = svadd_f32_x(svptrue_b32(), v94, v97); + svfloat32_t v130 = svsub_f32_x(svptrue_b32(), v93, v95); + svfloat32_t v100 = svadd_f32_x(svptrue_b32(), v77, v98); + svfloat32_t v112 = svsub_f32_x(svptrue_b32(), v101, v102); + svfloat32_t v122 = svsub_f32_x(svptrue_b32(), v99, v98); + svfloat32_t v131 = svadd_f32_x(svptrue_b32(), v101, v102); + svfloat32_t v158 = svmul_f32_x(svptrue_b32(), v115, v485); + svfloat32_t v163 = svmul_f32_x(svptrue_b32(), v116, v486); + svfloat32_t v173 = svmul_f32_x(svptrue_b32(), v118, v488); + svfloat32_t v178 = svmul_f32_x(svptrue_b32(), v119, v489); + svfloat32_t zero200 = svdup_n_f32(0); + svfloat32_t v200 = svcmla_f32_x(pred_full, zero200, v493, v123, 90); + svfloat32_t zero214 = svdup_n_f32(0); + svfloat32_t v214 = svcmla_f32_x(pred_full, zero214, v495, v125, 90); + svfloat32_t zero221 = svdup_n_f32(0); + svfloat32_t v221 = svcmla_f32_x(pred_full, zero221, v496, v126, 90); + svfloat32_t zero235 = svdup_n_f32(0); + svfloat32_t v235 = svcmla_f32_x(pred_full, zero235, v498, v128, 90); + svfloat32_t zero242 = svdup_n_f32(0); + svfloat32_t v242 = svcmla_f32_x(pred_full, zero242, v499, v129, 90); + svfloat32_t v103 = svadd_f32_x(svptrue_b32(), v100, v99); + svfloat32_t v113 = svsub_f32_x(svptrue_b32(), v112, v96); + svfloat32_t v193 = svmul_f32_x(svptrue_b32(), v122, v492); + svfloat32_t zero256 = svdup_n_f32(0); + svfloat32_t v256 = svcmla_f32_x(pred_full, zero256, v501, v131, 90); + svfloat32_t v258 = svmla_f32_x(pred_full, v158, v114, v484); + svfloat32_t v259 = svmla_f32_x(pred_full, v163, v115, v485); + svfloat32_t v260 = svnmls_f32_x(pred_full, v163, v114, v484); + svfloat32_t v261 = svmla_f32_x(pred_full, v173, v117, v487); + svfloat32_t v262 = svmla_f32_x(pred_full, v178, v118, v488); + svfloat32_t v263 = 
svnmls_f32_x(pred_full, v178, v117, v487); + svfloat32_t v266 = svcmla_f32_x(pred_full, v214, v494, v124, 90); + svfloat32_t v267 = svsub_f32_x(svptrue_b32(), v200, v214); + svfloat32_t v268 = svcmla_f32_x(pred_full, v235, v497, v127, 90); + svfloat32_t v269 = svsub_f32_x(svptrue_b32(), v221, v235); + svfloat32_t v111 = svadd_f32_x(svptrue_b32(), v480, v103); + svfloat32_t zero148 = svdup_n_f32(0); + svfloat32_t v148 = svcmla_f32_x(pred_full, zero148, v483, v113, 90); + svfloat32_t v264 = svmla_f32_x(pred_full, v193, v121, v491); + svfloat32_t v265 = svmla_f32_x(pred_full, v193, v120, v490); + svfloat32_t v270 = svcmla_f32_x(pred_full, v256, v500, v130, 90); + svfloat32_t v271 = svsub_f32_x(svptrue_b32(), v242, v256); + svfloat32_t v290 = svadd_f32_x(svptrue_b32(), v266, v267); + svfloat32_t v257 = svmls_f32_x(pred_full, v111, v103, v482); + svfloat32_t v272 = svadd_f32_x(svptrue_b32(), v262, v264); + svfloat32_t v282 = svadd_f32_x(svptrue_b32(), v148, v268); + svfloat32_t v284 = svsub_f32_x(svptrue_b32(), v270, v266); + svfloat32_t v286 = svadd_f32_x(svptrue_b32(), v148, v271); + svfloat32_t v288 = svsub_f32_x(svptrue_b32(), v271, v267); + svfloat32_t v291 = svadd_f32_x(svptrue_b32(), v290, v268); + svst1_f64(pred_full, (double *)(v509), svreinterpret_f64_f32(v111)); + svfloat32_t v273 = svadd_f32_x(svptrue_b32(), v272, v257); + svfloat32_t v274 = svsub_f32_x(svptrue_b32(), v257, v259); + svfloat32_t v276 = svadd_f32_x(svptrue_b32(), v257, v263); + svfloat32_t v278 = svsub_f32_x(svptrue_b32(), v257, v260); + svfloat32_t v280 = svadd_f32_x(svptrue_b32(), v257, v258); + svfloat32_t v283 = svadd_f32_x(svptrue_b32(), v282, v270); + svfloat32_t v285 = svsub_f32_x(svptrue_b32(), v284, v148); + svfloat32_t v287 = svadd_f32_x(svptrue_b32(), v286, v269); + svfloat32_t v289 = svsub_f32_x(svptrue_b32(), v288, v148); + svfloat32_t v292 = svadd_f32_x(svptrue_b32(), v291, v269); + svfloat32_t v275 = svsub_f32_x(svptrue_b32(), v274, v264); + svfloat32_t v277 = svadd_f32_x(svptrue_b32(), v276, v265); + svfloat32_t v279 = svsub_f32_x(svptrue_b32(), v278, v265); + svfloat32_t v281 = svsub_f32_x(svptrue_b32(), v280, v261); + svfloat32_t v293 = svsub_f32_x(svptrue_b32(), v292, v148); + svfloat32_t v295 = svadd_f32_x(svptrue_b32(), v273, v283); + svfloat32_t v302 = svsub_f32_x(svptrue_b32(), v273, v283); + svfloat32_t v294 = svadd_f32_x(svptrue_b32(), v281, v293); + svfloat32_t v296 = svadd_f32_x(svptrue_b32(), v275, v285); + svfloat32_t v297 = svsub_f32_x(svptrue_b32(), v277, v287); + svfloat32_t v298 = svadd_f32_x(svptrue_b32(), v279, v289); + svfloat32_t v299 = svsub_f32_x(svptrue_b32(), v279, v289); + svfloat32_t v300 = svadd_f32_x(svptrue_b32(), v277, v287); + svfloat32_t v301 = svsub_f32_x(svptrue_b32(), v275, v285); + svfloat32_t v303 = svsub_f32_x(svptrue_b32(), v281, v293); + svst1_f64(pred_full, (double *)(v527), svreinterpret_f64_f32(v295)); + svst1_f64(pred_full, (double *)(v590), svreinterpret_f64_f32(v302)); + svst1_f64(pred_full, (double *)(v518), svreinterpret_f64_f32(v294)); + svst1_f64(pred_full, (double *)(v536), svreinterpret_f64_f32(v296)); + svst1_f64(pred_full, (double *)(v545), svreinterpret_f64_f32(v297)); + svst1_f64(pred_full, (double *)(v554), svreinterpret_f64_f32(v298)); + svst1_f64(pred_full, (double *)(v563), svreinterpret_f64_f32(v299)); + svst1_f64(pred_full, (double *)(v572), svreinterpret_f64_f32(v300)); + svst1_f64(pred_full, (double *)(v581), svreinterpret_f64_f32(v301)); + svst1_f64(pred_full, (double *)(v599), svreinterpret_f64_f32(v303)); + v5 += v11; + v6 += v12; + } 
+} +#endif + #ifndef ARMRAL_ARCH_SVE void armral_fft_cf32_cf32_cf32_ac_n_gu13(const armral_cmplx_f32_t *restrict x, armral_cmplx_f32_t *restrict y, @@ -942,6 +1920,475 @@ void armral_fft_cf32_cf32_cf32_ac_n_gu14(const armral_cmplx_f32_t *restrict x, } #endif +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cf32_ac_n_gu15(const armral_cmplx_f32_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + int idist, float dir) { + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + for (int j = 0; j < howmany; j += 1) { + float32x2_t v61 = v5[istride]; + float v119 = -1.2500000000000000e+00F; + float v123 = 5.5901699437494745e-01F; + float v126 = 1.5388417685876268e+00F; + float v127 = -1.5388417685876268e+00F; + float v133 = 5.8778525229247325e-01F; + float v134 = -5.8778525229247325e-01F; + float v140 = 3.6327126400268028e-01F; + float v141 = -3.6327126400268028e-01F; + float v165 = -1.4999999999999998e+00F; + float v169 = 1.8749999999999998e+00F; + float v173 = -8.3852549156242107e-01F; + float v176 = -2.3082626528814396e+00F; + float v177 = 2.3082626528814396e+00F; + float v183 = -8.8167787843870971e-01F; + float v184 = 8.8167787843870971e-01F; + float v190 = -5.4490689600402031e-01F; + float v191 = 5.4490689600402031e-01F; + float v214 = 8.6602540378443871e-01F; + float v215 = -8.6602540378443871e-01F; + float v221 = -1.0825317547305484e+00F; + float v222 = 1.0825317547305484e+00F; + float v228 = 4.8412291827592718e-01F; + float v229 = -4.8412291827592718e-01F; + float32x2_t v231 = (float32x2_t){v4, v4}; + float v236 = -1.3326760640014592e+00F; + float v240 = -5.0903696045512736e-01F; + float v244 = -3.1460214309120460e-01F; + float32x2_t v32 = v5[0]; + float32x2_t v120 = (float32x2_t){v119, v119}; + float32x2_t v124 = (float32x2_t){v123, v123}; + float32x2_t v128 = (float32x2_t){v126, v127}; + float32x2_t v135 = (float32x2_t){v133, v134}; + float32x2_t v142 = (float32x2_t){v140, v141}; + float32x2_t v166 = (float32x2_t){v165, v165}; + float32x2_t v170 = (float32x2_t){v169, v169}; + float32x2_t v174 = (float32x2_t){v173, v173}; + float32x2_t v178 = (float32x2_t){v176, v177}; + float32x2_t v185 = (float32x2_t){v183, v184}; + float32x2_t v192 = (float32x2_t){v190, v191}; + float32x2_t v216 = (float32x2_t){v214, v215}; + float32x2_t v223 = (float32x2_t){v221, v222}; + float32x2_t v230 = (float32x2_t){v228, v229}; + float32x2_t v237 = (float32x2_t){v236, v236}; + float32x2_t v241 = (float32x2_t){v240, v240}; + float32x2_t v245 = (float32x2_t){v244, v244}; + float32x2_t v20 = v5[istride * 5]; + float32x2_t v25 = v5[istride * 10]; + float32x2_t v38 = v5[istride * 8]; + float32x2_t v43 = v5[istride * 13]; + float32x2_t v50 = v5[istride * 3]; + float32x2_t v56 = v5[istride * 11]; + float32x2_t v68 = v5[istride * 6]; + float32x2_t v74 = v5[istride * 14]; + float32x2_t v79 = v5[istride * 4]; + float32x2_t v86 = v5[istride * 9]; + float32x2_t v92 = v5[istride * 2]; + float32x2_t v97 = v5[istride * 7]; + float32x2_t v104 = v5[istride * 12]; + float32x2_t v130 = vmul_f32(v231, v128); + float32x2_t v137 = vmul_f32(v231, v135); + float32x2_t v144 = vmul_f32(v231, v142); + float32x2_t v180 = vmul_f32(v231, v178); + float32x2_t v187 = vmul_f32(v231, v185); + float32x2_t v194 = vmul_f32(v231, v192); + float32x2_t v218 = vmul_f32(v231, v216); + float32x2_t v225 = vmul_f32(v231, v223); + float32x2_t v232 = vmul_f32(v231, v230); + float32x2_t v26 = vadd_f32(v20, v25); + float32x2_t v27 = vsub_f32(v20, v25); + float32x2_t 
v44 = vadd_f32(v38, v43); + float32x2_t v45 = vsub_f32(v38, v43); + float32x2_t v62 = vadd_f32(v56, v61); + float32x2_t v63 = vsub_f32(v56, v61); + float32x2_t v80 = vadd_f32(v74, v79); + float32x2_t v81 = vsub_f32(v74, v79); + float32x2_t v98 = vadd_f32(v92, v97); + float32x2_t v99 = vsub_f32(v92, v97); + float32x2_t v33 = vadd_f32(v26, v32); + float32x2_t v51 = vadd_f32(v44, v50); + float32x2_t v69 = vadd_f32(v62, v68); + float32x2_t v87 = vadd_f32(v80, v86); + float32x2_t v105 = vadd_f32(v98, v104); + float32x2_t v156 = vadd_f32(v44, v98); + float32x2_t v157 = vsub_f32(v44, v98); + float32x2_t v158 = vadd_f32(v80, v62); + float32x2_t v159 = vsub_f32(v80, v62); + float32x2_t v206 = vadd_f32(v45, v99); + float32x2_t v207 = vsub_f32(v45, v99); + float32x2_t v208 = vadd_f32(v81, v63); + float32x2_t v209 = vsub_f32(v81, v63); + float32x2_t v106 = vadd_f32(v51, v105); + float32x2_t v107 = vsub_f32(v51, v105); + float32x2_t v108 = vadd_f32(v87, v69); + float32x2_t v109 = vsub_f32(v87, v69); + float32x2_t v160 = vadd_f32(v156, v158); + float32x2_t v161 = vsub_f32(v156, v158); + float32x2_t v162 = vadd_f32(v157, v159); + float32x2_t v181 = vrev64_f32(v157); + float32x2_t v195 = vrev64_f32(v159); + float32x2_t v210 = vadd_f32(v206, v208); + float32x2_t v211 = vsub_f32(v206, v208); + float32x2_t v212 = vadd_f32(v207, v209); + float32x2_t v238 = vmul_f32(v207, v237); + float32x2_t v246 = vmul_f32(v209, v245); + float32x2_t v110 = vadd_f32(v106, v108); + float32x2_t v111 = vsub_f32(v106, v108); + float32x2_t v112 = vadd_f32(v107, v109); + float32x2_t v131 = vrev64_f32(v107); + float32x2_t v145 = vrev64_f32(v109); + float32x2_t v163 = vadd_f32(v160, v26); + float32x2_t v171 = vmul_f32(v160, v170); + float32x2_t v175 = vmul_f32(v161, v174); + float32x2_t v182 = vmul_f32(v181, v180); + float32x2_t v188 = vrev64_f32(v162); + float32x2_t v196 = vmul_f32(v195, v194); + float32x2_t v213 = vadd_f32(v210, v27); + float32x2_t v226 = vrev64_f32(v210); + float32x2_t v233 = vrev64_f32(v211); + float32x2_t v242 = vmul_f32(v212, v241); + float32x2_t v113 = vadd_f32(v110, v33); + float32x2_t v121 = vmul_f32(v110, v120); + float32x2_t v125 = vmul_f32(v111, v124); + float32x2_t v132 = vmul_f32(v131, v130); + float32x2_t v138 = vrev64_f32(v112); + float32x2_t v146 = vmul_f32(v145, v144); + float32x2_t v167 = vmul_f32(v163, v166); + float32x2_t v189 = vmul_f32(v188, v187); + float32x2_t v219 = vrev64_f32(v213); + float32x2_t v227 = vmul_f32(v226, v225); + float32x2_t v234 = vmul_f32(v233, v232); + float32x2_t v250 = vsub_f32(v238, v242); + float32x2_t v251 = vadd_f32(v242, v246); + float32x2_t v139 = vmul_f32(v138, v137); + float32x2_t v147 = vadd_f32(v113, v121); + float32x2_t v197 = vadd_f32(v167, v171); + float32x2_t v200 = vsub_f32(v182, v189); + float32x2_t v201 = vadd_f32(v189, v196); + float32x2_t v220 = vmul_f32(v219, v218); + float32x2_t v256 = vadd_f32(v113, v167); + v6[0] = v113; + float32x2_t v148 = vadd_f32(v147, v125); + float32x2_t v149 = vsub_f32(v147, v125); + float32x2_t v150 = vsub_f32(v132, v139); + float32x2_t v151 = vadd_f32(v139, v146); + float32x2_t v198 = vadd_f32(v197, v175); + float32x2_t v199 = vsub_f32(v197, v175); + float32x2_t v247 = vadd_f32(v220, v227); + float32x2_t v257 = vadd_f32(v256, v220); + float32x2_t v258 = vsub_f32(v256, v220); + float32x2_t v152 = vadd_f32(v148, v150); + float32x2_t v153 = vsub_f32(v148, v150); + float32x2_t v154 = vadd_f32(v149, v151); + float32x2_t v155 = vsub_f32(v149, v151); + float32x2_t v202 = vadd_f32(v198, v200); + float32x2_t v203 = vsub_f32(v198, 
v200); + float32x2_t v204 = vadd_f32(v199, v201); + float32x2_t v205 = vsub_f32(v199, v201); + float32x2_t v248 = vadd_f32(v247, v234); + float32x2_t v249 = vsub_f32(v247, v234); + v6[ostride * 10] = v258; + v6[ostride * 5] = v257; + float32x2_t v252 = vadd_f32(v248, v250); + float32x2_t v253 = vsub_f32(v248, v250); + float32x2_t v254 = vadd_f32(v249, v251); + float32x2_t v255 = vsub_f32(v249, v251); + float32x2_t v274 = vadd_f32(v153, v203); + v6[ostride * 6] = v153; + float32x2_t v292 = vadd_f32(v155, v205); + v6[ostride * 12] = v155; + float32x2_t v310 = vadd_f32(v154, v204); + v6[ostride * 3] = v154; + float32x2_t v328 = vadd_f32(v152, v202); + v6[ostride * 9] = v152; + float32x2_t v275 = vadd_f32(v274, v253); + float32x2_t v276 = vsub_f32(v274, v253); + float32x2_t v293 = vadd_f32(v292, v255); + float32x2_t v294 = vsub_f32(v292, v255); + float32x2_t v311 = vadd_f32(v310, v254); + float32x2_t v312 = vsub_f32(v310, v254); + float32x2_t v329 = vadd_f32(v328, v252); + float32x2_t v330 = vsub_f32(v328, v252); + v6[ostride] = v276; + v6[ostride * 11] = v275; + v6[ostride * 7] = v294; + v6[ostride * 2] = v293; + v6[ostride * 13] = v312; + v6[ostride * 8] = v311; + v6[ostride * 4] = v330; + v6[ostride * 14] = v329; + v5 += 1 * idist; + v6 += 1 * 1; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cf32_ac_n_gu15(const armral_cmplx_f32_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + int idist, float dir) { + int64_t v0 = istride; + int64_t v1 = idist; + int64_t v2 = ostride; + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * v1; + int64_t v12 = v10 * 1; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + float v152 = -1.2500000000000000e+00F; + float v157 = 5.5901699437494745e-01F; + float v162 = -1.5388417685876268e+00F; + float v169 = -5.8778525229247325e-01F; + float v176 = -3.6327126400268028e-01F; + float v200 = -1.4999999999999998e+00F; + float v205 = 1.8749999999999998e+00F; + float v210 = -8.3852549156242107e-01F; + float v215 = 2.3082626528814396e+00F; + float v222 = 8.8167787843870971e-01F; + float v229 = 5.4490689600402031e-01F; + float v253 = -8.6602540378443871e-01F; + float v260 = 1.0825317547305484e+00F; + float v267 = -4.8412291827592718e-01F; + float v274 = -1.3326760640014592e+00F; + float v279 = -5.0903696045512736e-01F; + float v284 = -3.1460214309120460e-01F; + const float32x2_t *v487 = &v5[v0]; + float32x2_t *v614 = &v6[v2]; + int64_t v19 = v0 * 5; + int64_t v26 = v0 * 10; + int64_t v43 = v0 * 8; + int64_t v50 = v0 * 13; + int64_t v59 = v0 * 3; + int64_t v67 = v0 * 11; + int64_t v83 = v0 * 6; + int64_t v91 = v0 * 14; + int64_t v98 = v0 * 4; + int64_t v107 = v0 * 9; + int64_t v115 = v0 * 2; + int64_t v122 = v0 * 7; + int64_t v131 = v0 * 12; + float v165 = v4 * v162; + float v172 = v4 * v169; + float v179 = v4 * v176; + float v218 = v4 * v215; + float v225 = v4 * v222; + float v232 = v4 * v229; + float v256 = v4 * v253; + float v263 = v4 * v260; + float v270 = v4 * v267; + int64_t v308 = v2 * 10; + int64_t v315 = v2 * 5; + int64_t v325 = v2 * 6; + int64_t v339 = v2 * 11; + int64_t v349 = v2 * 12; + int64_t v356 = v2 * 7; + int64_t v363 = v2 * 2; + int64_t v373 = v2 * 3; + int64_t v380 = v2 * 13; + int64_t v387 = v2 * 8; + int64_t v397 = v2 * 9; + int64_t v404 = v2 * 4; + int64_t v411 = v2 * 14; + const float32x2_t *v442 = &v5[0]; + svint64_t 
v551 = svindex_s64(0, v1); + svfloat32_t v554 = svdup_n_f32(v152); + svfloat32_t v555 = svdup_n_f32(v157); + svfloat32_t v559 = svdup_n_f32(v200); + svfloat32_t v560 = svdup_n_f32(v205); + svfloat32_t v561 = svdup_n_f32(v210); + svfloat32_t v568 = svdup_n_f32(v274); + svfloat32_t v569 = svdup_n_f32(v279); + svfloat32_t v570 = svdup_n_f32(v284); + float32x2_t *v578 = &v6[0]; + const float32x2_t *v423 = &v5[v19]; + const float32x2_t *v432 = &v5[v26]; + svfloat32_t v444 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v442), v551)); + const float32x2_t *v451 = &v5[v43]; + const float32x2_t *v460 = &v5[v50]; + const float32x2_t *v469 = &v5[v59]; + const float32x2_t *v478 = &v5[v67]; + svfloat32_t v489 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v487), v551)); + const float32x2_t *v496 = &v5[v83]; + const float32x2_t *v505 = &v5[v91]; + const float32x2_t *v514 = &v5[v98]; + const float32x2_t *v523 = &v5[v107]; + const float32x2_t *v532 = &v5[v115]; + const float32x2_t *v541 = &v5[v122]; + const float32x2_t *v550 = &v5[v131]; + svfloat32_t v556 = svdup_n_f32(v165); + svfloat32_t v557 = svdup_n_f32(v172); + svfloat32_t v558 = svdup_n_f32(v179); + svfloat32_t v562 = svdup_n_f32(v218); + svfloat32_t v563 = svdup_n_f32(v225); + svfloat32_t v564 = svdup_n_f32(v232); + svfloat32_t v565 = svdup_n_f32(v256); + svfloat32_t v566 = svdup_n_f32(v263); + svfloat32_t v567 = svdup_n_f32(v270); + float32x2_t *v587 = &v6[v308]; + float32x2_t *v596 = &v6[v315]; + float32x2_t *v605 = &v6[v325]; + float32x2_t *v623 = &v6[v339]; + float32x2_t *v632 = &v6[v349]; + float32x2_t *v641 = &v6[v356]; + float32x2_t *v650 = &v6[v363]; + float32x2_t *v659 = &v6[v373]; + float32x2_t *v668 = &v6[v380]; + float32x2_t *v677 = &v6[v387]; + float32x2_t *v686 = &v6[v397]; + float32x2_t *v695 = &v6[v404]; + float32x2_t *v704 = &v6[v411]; + svfloat32_t v425 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v423), v551)); + svfloat32_t v434 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v432), v551)); + svfloat32_t v453 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v451), v551)); + svfloat32_t v462 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v460), v551)); + svfloat32_t v471 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v469), v551)); + svfloat32_t v480 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v478), v551)); + svfloat32_t v498 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v496), v551)); + svfloat32_t v507 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v505), v551)); + svfloat32_t v516 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v514), v551)); + svfloat32_t v525 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v523), v551)); + svfloat32_t v534 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v532), v551)); + svfloat32_t v543 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v541), v551)); + svfloat32_t v552 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v550), v551)); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v425, v434); + svfloat32_t v33 = svsub_f32_x(svptrue_b32(), v425, v434); + svfloat32_t v56 = 
svadd_f32_x(svptrue_b32(), v453, v462); + svfloat32_t v57 = svsub_f32_x(svptrue_b32(), v453, v462); + svfloat32_t v80 = svadd_f32_x(svptrue_b32(), v480, v489); + svfloat32_t v81 = svsub_f32_x(svptrue_b32(), v480, v489); + svfloat32_t v104 = svadd_f32_x(svptrue_b32(), v507, v516); + svfloat32_t v105 = svsub_f32_x(svptrue_b32(), v507, v516); + svfloat32_t v128 = svadd_f32_x(svptrue_b32(), v534, v543); + svfloat32_t v129 = svsub_f32_x(svptrue_b32(), v534, v543); + svfloat32_t v41 = svadd_f32_x(svptrue_b32(), v32, v444); + svfloat32_t v65 = svadd_f32_x(svptrue_b32(), v56, v471); + svfloat32_t v89 = svadd_f32_x(svptrue_b32(), v80, v498); + svfloat32_t v113 = svadd_f32_x(svptrue_b32(), v104, v525); + svfloat32_t v137 = svadd_f32_x(svptrue_b32(), v128, v552); + svfloat32_t v191 = svadd_f32_x(svptrue_b32(), v56, v128); + svfloat32_t v192 = svsub_f32_x(svptrue_b32(), v56, v128); + svfloat32_t v193 = svadd_f32_x(svptrue_b32(), v104, v80); + svfloat32_t v194 = svsub_f32_x(svptrue_b32(), v104, v80); + svfloat32_t v244 = svadd_f32_x(svptrue_b32(), v57, v129); + svfloat32_t v245 = svsub_f32_x(svptrue_b32(), v57, v129); + svfloat32_t v246 = svadd_f32_x(svptrue_b32(), v105, v81); + svfloat32_t v247 = svsub_f32_x(svptrue_b32(), v105, v81); + svfloat32_t v138 = svadd_f32_x(svptrue_b32(), v65, v137); + svfloat32_t v139 = svsub_f32_x(svptrue_b32(), v65, v137); + svfloat32_t v140 = svadd_f32_x(svptrue_b32(), v113, v89); + svfloat32_t v141 = svsub_f32_x(svptrue_b32(), v113, v89); + svfloat32_t v195 = svadd_f32_x(svptrue_b32(), v191, v193); + svfloat32_t v196 = svsub_f32_x(svptrue_b32(), v191, v193); + svfloat32_t v197 = svadd_f32_x(svptrue_b32(), v192, v194); + svfloat32_t zero220 = svdup_n_f32(0); + svfloat32_t v220 = svcmla_f32_x(pred_full, zero220, v562, v192, 90); + svfloat32_t v248 = svadd_f32_x(svptrue_b32(), v244, v246); + svfloat32_t v249 = svsub_f32_x(svptrue_b32(), v244, v246); + svfloat32_t v250 = svadd_f32_x(svptrue_b32(), v245, v247); + svfloat32_t v287 = svmul_f32_x(svptrue_b32(), v247, v570); + svfloat32_t v142 = svadd_f32_x(svptrue_b32(), v138, v140); + svfloat32_t v143 = svsub_f32_x(svptrue_b32(), v138, v140); + svfloat32_t v144 = svadd_f32_x(svptrue_b32(), v139, v141); + svfloat32_t zero167 = svdup_n_f32(0); + svfloat32_t v167 = svcmla_f32_x(pred_full, zero167, v556, v139, 90); + svfloat32_t v198 = svadd_f32_x(svptrue_b32(), v195, v32); + svfloat32_t v208 = svmul_f32_x(svptrue_b32(), v195, v560); + svfloat32_t zero227 = svdup_n_f32(0); + svfloat32_t v227 = svcmla_f32_x(pred_full, zero227, v563, v197, 90); + svfloat32_t v251 = svadd_f32_x(svptrue_b32(), v248, v33); + svfloat32_t zero272 = svdup_n_f32(0); + svfloat32_t v272 = svcmla_f32_x(pred_full, zero272, v567, v249, 90); + svfloat32_t v282 = svmul_f32_x(svptrue_b32(), v250, v569); + svfloat32_t v145 = svadd_f32_x(svptrue_b32(), v142, v41); + svfloat32_t zero174 = svdup_n_f32(0); + svfloat32_t v174 = svcmla_f32_x(pred_full, zero174, v557, v144, 90); + svfloat32_t v238 = svsub_f32_x(svptrue_b32(), v220, v227); + svfloat32_t v239 = svcmla_f32_x(pred_full, v227, v564, v194, 90); + svfloat32_t zero258 = svdup_n_f32(0); + svfloat32_t v258 = svcmla_f32_x(pred_full, zero258, v565, v251, 90); + svfloat32_t v291 = svnmls_f32_x(pred_full, v282, v245, v568); + svfloat32_t v292 = svmla_f32_x(pred_full, v287, v250, v569); + svfloat32_t v182 = svmla_f32_x(pred_full, v145, v142, v554); + svfloat32_t v185 = svsub_f32_x(svptrue_b32(), v167, v174); + svfloat32_t v186 = svcmla_f32_x(pred_full, v174, v558, v141, 90); + svfloat32_t v235 = svmla_f32_x(pred_full, 
v208, v198, v559); + svfloat32_t v288 = svcmla_f32_x(pred_full, v258, v566, v248, 90); + svfloat32_t v297 = svmla_f32_x(pred_full, v145, v198, v559); + svst1_f64(pred_full, (double *)(v578), svreinterpret_f64_f32(v145)); + svfloat32_t v183 = svmla_f32_x(pred_full, v182, v143, v555); + svfloat32_t v184 = svmls_f32_x(pred_full, v182, v143, v555); + svfloat32_t v236 = svmla_f32_x(pred_full, v235, v196, v561); + svfloat32_t v237 = svmls_f32_x(pred_full, v235, v196, v561); + svfloat32_t v289 = svadd_f32_x(svptrue_b32(), v288, v272); + svfloat32_t v290 = svsub_f32_x(svptrue_b32(), v288, v272); + svfloat32_t v298 = svadd_f32_x(svptrue_b32(), v297, v258); + svfloat32_t v299 = svsub_f32_x(svptrue_b32(), v297, v258); + svfloat32_t v187 = svadd_f32_x(svptrue_b32(), v183, v185); + svfloat32_t v188 = svsub_f32_x(svptrue_b32(), v183, v185); + svfloat32_t v189 = svadd_f32_x(svptrue_b32(), v184, v186); + svfloat32_t v190 = svsub_f32_x(svptrue_b32(), v184, v186); + svfloat32_t v240 = svadd_f32_x(svptrue_b32(), v236, v238); + svfloat32_t v241 = svsub_f32_x(svptrue_b32(), v236, v238); + svfloat32_t v242 = svadd_f32_x(svptrue_b32(), v237, v239); + svfloat32_t v243 = svsub_f32_x(svptrue_b32(), v237, v239); + svfloat32_t v293 = svadd_f32_x(svptrue_b32(), v289, v291); + svfloat32_t v294 = svsub_f32_x(svptrue_b32(), v289, v291); + svfloat32_t v295 = svadd_f32_x(svptrue_b32(), v290, v292); + svfloat32_t v296 = svsub_f32_x(svptrue_b32(), v290, v292); + svst1_f64(pred_full, (double *)(v587), svreinterpret_f64_f32(v299)); + svst1_f64(pred_full, (double *)(v596), svreinterpret_f64_f32(v298)); + svfloat32_t v321 = svadd_f32_x(svptrue_b32(), v188, v241); + svfloat32_t v345 = svadd_f32_x(svptrue_b32(), v190, v243); + svfloat32_t v369 = svadd_f32_x(svptrue_b32(), v189, v242); + svfloat32_t v393 = svadd_f32_x(svptrue_b32(), v187, v240); + svst1_f64(pred_full, (double *)(v605), svreinterpret_f64_f32(v188)); + svst1_f64(pred_full, (double *)(v632), svreinterpret_f64_f32(v190)); + svst1_f64(pred_full, (double *)(v659), svreinterpret_f64_f32(v189)); + svst1_f64(pred_full, (double *)(v686), svreinterpret_f64_f32(v187)); + svfloat32_t v322 = svadd_f32_x(svptrue_b32(), v321, v294); + svfloat32_t v323 = svsub_f32_x(svptrue_b32(), v321, v294); + svfloat32_t v346 = svadd_f32_x(svptrue_b32(), v345, v296); + svfloat32_t v347 = svsub_f32_x(svptrue_b32(), v345, v296); + svfloat32_t v370 = svadd_f32_x(svptrue_b32(), v369, v295); + svfloat32_t v371 = svsub_f32_x(svptrue_b32(), v369, v295); + svfloat32_t v394 = svadd_f32_x(svptrue_b32(), v393, v293); + svfloat32_t v395 = svsub_f32_x(svptrue_b32(), v393, v293); + svst1_f64(pred_full, (double *)(v614), svreinterpret_f64_f32(v323)); + svst1_f64(pred_full, (double *)(v623), svreinterpret_f64_f32(v322)); + svst1_f64(pred_full, (double *)(v641), svreinterpret_f64_f32(v347)); + svst1_f64(pred_full, (double *)(v650), svreinterpret_f64_f32(v346)); + svst1_f64(pred_full, (double *)(v668), svreinterpret_f64_f32(v371)); + svst1_f64(pred_full, (double *)(v677), svreinterpret_f64_f32(v370)); + svst1_f64(pred_full, (double *)(v695), svreinterpret_f64_f32(v395)); + svst1_f64(pred_full, (double *)(v704), svreinterpret_f64_f32(v394)); + v5 += v11; + v6 += v12; + } +} +#endif + #ifndef ARMRAL_ARCH_SVE void armral_fft_cf32_cf32_cf32_ac_n_gu16(const armral_cmplx_f32_t *restrict x, armral_cmplx_f32_t *restrict y, diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_gu.h b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_gu.h index 4db92c18f7a83f84bc13fb321379c304e8e82135..a09608ee02204b95269b505e73ea5187dbdebafa 
100644
--- a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_gu.h
+++ b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_gu.h
@@ -18,8 +18,12 @@ typedef void(cf32_cf32_cf32_ac_n_gu_fft_t)(const armral_cmplx_f32_t *x,
                                            int ostride, int howmany, int idist,
                                            float dir);

+cf32_cf32_cf32_ac_n_gu_fft_t armral_fft_cf32_cf32_cf32_ac_n_gu7;
+cf32_cf32_cf32_ac_n_gu_fft_t armral_fft_cf32_cf32_cf32_ac_n_gu9;
+cf32_cf32_cf32_ac_n_gu_fft_t armral_fft_cf32_cf32_cf32_ac_n_gu11;
 cf32_cf32_cf32_ac_n_gu_fft_t armral_fft_cf32_cf32_cf32_ac_n_gu13;
 cf32_cf32_cf32_ac_n_gu_fft_t armral_fft_cf32_cf32_cf32_ac_n_gu14;
+cf32_cf32_cf32_ac_n_gu_fft_t armral_fft_cf32_cf32_cf32_ac_n_gu15;
 cf32_cf32_cf32_ac_n_gu_fft_t armral_fft_cf32_cf32_cf32_ac_n_gu16;
 cf32_cf32_cf32_ac_n_gu_fft_t armral_fft_cf32_cf32_cf32_ac_n_gu17;
 cf32_cf32_cf32_ac_n_gu_fft_t armral_fft_cf32_cf32_cf32_ac_n_gu18;
diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ab_t_gs.c b/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ab_t_gs.c
new file mode 100644
index 0000000000000000000000000000000000000000..090862286b428c6f3cacfa9acbb05a60cdcc4c7e
--- /dev/null
+++ b/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ab_t_gs.c
@@ -0,0 +1,18691 @@
+/*
+  Arm RAN Acceleration Library
+  SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its
+  affiliates
+  SPDX-License-Identifier: BSD-3-Clause
+*/
+#include "fft_cf32_cf32_cs16_ab_t_gs.h"
+
+#include <arm_neon.h>
+#ifdef ARMRAL_ARCH_SVE
+#include <arm_sve.h>
+#endif
+
+#ifndef ARMRAL_ARCH_SVE
+void armral_fft_cf32_cf32_cs16_ab_t_gs2(const armral_cmplx_f32_t *restrict x,
+                                        armral_cmplx_int16_t *restrict y,
+                                        int istride, int ostride,
+                                        const armral_cmplx_f32_t *restrict w,
+                                        int howmany, int idist, int odist,
+                                        float dir) {
+  const float32x2_t *v5 = (const float32x2_t *)x;
+  int32_t *v6 = (int32_t *)y;
+  const float32x2_t *v7 = (const float32x2_t *)w;
+  for (int j = 0; j < howmany; j += 1) {
+    float32x2_t v20 = v5[istride];
+    float32x2_t v39 = vtrn1_f32(v20, v20);
+    float32x2_t v40 = vtrn2_f32(v20, v20);
+    float32x2_t v51 = v5[0];
+    float32x2_t v38 = v7[j * 2];
+    int64_t v42 = j * 2 + 1;
+    float32x2_t v43 = v7[v42];
+    float32x2_t v44 = vmul_f32(v39, v38);
+    float32x2_t v46 = vfma_f32(v44, v40, v43);
+    float32x2_t v52 = vadd_f32(v51, v46);
+    float32x2_t v53 = vsub_f32(v51, v46);
+    int16x4_t v64 =
+        vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v52, 15), (int32x2_t){0, 0}));
+    int16x4_t v70 =
+        vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v53, 15), (int32x2_t){0, 0}));
+    v6[0] = vget_lane_s32(vreinterpret_s32_s16(v64), 0);
+    v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v70), 0);
+    v5 += 1 * idist;
+    v6 += 1 * odist;
+  }
+}
+#endif
+
+#ifdef ARMRAL_ARCH_SVE
+void armral_fft_cf32_cf32_cs16_ab_t_gs2(const armral_cmplx_f32_t *restrict x,
+                                        armral_cmplx_int16_t *restrict y,
+                                        int istride, int ostride,
+                                        const armral_cmplx_f32_t *restrict w,
+                                        int howmany, int idist, int odist,
+                                        float dir) {
+  int64_t v0 = istride;
+  int64_t v1 = idist;
+  int64_t v2 = ostride;
+  int64_t v3 = odist;
+  const float32x2_t *v5 = (const float32x2_t *)x;
+  int32_t *v6 = (int32_t *)y;
+  const float32x2_t *v7 = (const float32x2_t *)w;
+  int64_t v8 = howmany;
+  int64_t v10 = svcntd();
+  int64_t v11 = v10 * v1;
+  int64_t v12 = v10 * v3;
+  for (int j = 0; j < v8; j += v10) {
+    svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2);
+    int64_t v13 = j;
+    const float32x2_t *v80 = &v5[v0];
+    int32_t *v113 = &v6[v2];
+    svfloat32_t v37 =
+        svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v13]));
+    const float32x2_t *v92 = &v5[0];
+    svint64_t v93 = svindex_s64(0, v1);
+    int32_t *v104 = &v6[0];
+    svint64_t
v114 = svindex_s64(0, v3); + svfloat32_t v82 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v80), v93)); + svfloat32_t v94 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v92), v93)); + svfloat32_t zero38 = svdup_n_f32(0); + svfloat32_t v38 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero38, v82, v37, 0), v82, v37, 90); + svfloat32_t v46 = svadd_f32_x(svptrue_b32(), v94, v38); + svfloat32_t v47 = svsub_f32_x(svptrue_b32(), v94, v38); + svint16_t v60 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v46, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v68 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v47, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v104), v114, + svreinterpret_u64_s16(v60)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v113), v114, + svreinterpret_u64_s16(v68)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cs16_ab_t_gs3(const armral_cmplx_f32_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, + const armral_cmplx_f32_t *restrict w, + int howmany, int idist, int odist, + float dir) { + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + int32_t *v6 = (int32_t *)y; + const float32x2_t *v7 = (const float32x2_t *)w; + for (int j = 0; j < howmany; j += 1) { + float32x2_t v20 = v5[istride]; + float v91 = -1.4999999999999998e+00F; + float v94 = 8.6602540378443871e-01F; + float v95 = -8.6602540378443871e-01F; + float32x2_t v97 = (float32x2_t){v4, v4}; + float32x2_t v57 = vtrn1_f32(v20, v20); + float32x2_t v58 = vtrn2_f32(v20, v20); + float32x2_t v84 = v5[0]; + float32x2_t v92 = (float32x2_t){v91, v91}; + float32x2_t v96 = (float32x2_t){v94, v95}; + float32x2_t v38 = v5[istride * 2]; + float32x2_t v56 = v7[j * 4]; + int64_t v60 = j * 4 + 1; + int64_t v68 = 2 + j * 4; + float32x2_t v98 = vmul_f32(v97, v96); + float32x2_t v61 = v7[v60]; + float32x2_t v62 = vmul_f32(v57, v56); + float32x2_t v69 = v7[v68]; + float32x2_t v70 = vtrn1_f32(v38, v38); + float32x2_t v71 = vtrn2_f32(v38, v38); + int64_t v73 = v68 + 1; + float32x2_t v74 = v7[v73]; + float32x2_t v75 = vmul_f32(v70, v69); + float32x2_t v64 = vfma_f32(v62, v58, v61); + float32x2_t v77 = vfma_f32(v75, v71, v74); + float32x2_t v78 = vadd_f32(v64, v77); + float32x2_t v79 = vsub_f32(v64, v77); + float32x2_t v85 = vadd_f32(v78, v84); + float32x2_t v93 = vmul_f32(v78, v92); + float32x2_t v99 = vrev64_f32(v79); + float32x2_t v100 = vmul_f32(v99, v98); + float32x2_t v101 = vadd_f32(v85, v93); + int16x4_t v106 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v85, 15), (int32x2_t){0, 0})); + float32x2_t v102 = vadd_f32(v101, v100); + float32x2_t v103 = vsub_f32(v101, v100); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v106), 0); + int16x4_t v112 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v103, 15), (int32x2_t){0, 0})); + int16x4_t v118 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v102, 15), (int32x2_t){0, 0})); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v112), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v118), 0); + v5 += 1 * idist; + v6 += 1 * odist; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cs16_ab_t_gs3(const armral_cmplx_f32_t 
*restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, + const armral_cmplx_f32_t *restrict w, + int howmany, int idist, int odist, + float dir) { + int64_t v0 = istride; + int64_t v1 = idist; + int64_t v2 = ostride; + int64_t v3 = odist; + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + int32_t *v6 = (int32_t *)y; + const float32x2_t *v7 = (const float32x2_t *)w; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * v1; + int64_t v12 = v10 * v3; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + int64_t v13 = j; + float v76 = -1.4999999999999998e+00F; + float v81 = -8.6602540378443871e-01F; + const float32x2_t *v120 = &v5[v0]; + int32_t *v163 = &v6[v2]; + int64_t v33 = v0 * 2; + int64_t v56 = v13 * 2; + float v84 = v4 * v81; + int64_t v107 = v2 * 2; + const float32x2_t *v141 = &v5[0]; + svint64_t v142 = svindex_s64(0, v1); + svfloat32_t v145 = svdup_n_f32(v76); + int32_t *v154 = &v6[0]; + svint64_t v173 = svindex_s64(0, v3); + svfloat32_t v51 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v56])); + int64_t v57 = v10 + v56; + svfloat32_t v122 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v120), v142)); + const float32x2_t *v130 = &v5[v33]; + svfloat32_t v143 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v141), v142)); + svfloat32_t v146 = svdup_n_f32(v84); + int32_t *v172 = &v6[v107]; + svfloat32_t zero52 = svdup_n_f32(0); + svfloat32_t v52 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero52, v122, v51, 0), + v122, v51, 90); + svfloat32_t v58 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v57])); + svfloat32_t v132 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v130), v142)); + svfloat32_t zero59 = svdup_n_f32(0); + svfloat32_t v59 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero59, v132, v58, 0), + v132, v58, 90); + svfloat32_t v60 = svadd_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v61 = svsub_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v69 = svadd_f32_x(svptrue_b32(), v60, v143); + svfloat32_t zero86 = svdup_n_f32(0); + svfloat32_t v86 = svcmla_f32_x(pred_full, zero86, v146, v61, 90); + svfloat32_t v87 = svmla_f32_x(pred_full, v69, v60, v145); + svint16_t v92 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v69, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v88 = svadd_f32_x(svptrue_b32(), v87, v86); + svfloat32_t v89 = svsub_f32_x(svptrue_b32(), v87, v86); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v154), v173, + svreinterpret_u64_s16(v92)); + svint16_t v100 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v89, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v108 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v88, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v163), v173, + svreinterpret_u64_s16(v100)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v172), v173, + svreinterpret_u64_s16(v108)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void 
armral_fft_cf32_cf32_cs16_ab_t_gs4(const armral_cmplx_f32_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, + const armral_cmplx_f32_t *restrict w, + int howmany, int idist, int odist, + float dir) { + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + int32_t *v6 = (int32_t *)y; + const float32x2_t *v7 = (const float32x2_t *)w; + for (int j = 0; j < howmany; j += 1) { + float32x2_t v51 = v5[istride]; + float v132 = 1.0000000000000000e+00F; + float v133 = -1.0000000000000000e+00F; + float32x2_t v135 = (float32x2_t){v4, v4}; + float32x2_t v88 = vtrn1_f32(v51, v51); + float32x2_t v89 = vtrn2_f32(v51, v51); + float32x2_t v113 = v5[0]; + float32x2_t v134 = (float32x2_t){v132, v133}; + float32x2_t v20 = v5[istride * 2]; + int64_t v37 = 2 + j * 6; + float32x2_t v69 = v5[istride * 3]; + float32x2_t v87 = v7[j * 6]; + int64_t v91 = j * 6 + 1; + int64_t v99 = 4 + j * 6; + float32x2_t v136 = vmul_f32(v135, v134); + float32x2_t v38 = v7[v37]; + float32x2_t v39 = vtrn1_f32(v20, v20); + float32x2_t v40 = vtrn2_f32(v20, v20); + int64_t v42 = v37 + 1; + float32x2_t v92 = v7[v91]; + float32x2_t v93 = vmul_f32(v88, v87); + float32x2_t v100 = v7[v99]; + float32x2_t v101 = vtrn1_f32(v69, v69); + float32x2_t v102 = vtrn2_f32(v69, v69); + int64_t v104 = v99 + 1; + float32x2_t v43 = v7[v42]; + float32x2_t v44 = vmul_f32(v39, v38); + float32x2_t v105 = v7[v104]; + float32x2_t v106 = vmul_f32(v101, v100); + float32x2_t v95 = vfma_f32(v93, v89, v92); + float32x2_t v46 = vfma_f32(v44, v40, v43); + float32x2_t v108 = vfma_f32(v106, v102, v105); + float32x2_t v114 = vadd_f32(v113, v46); + float32x2_t v115 = vsub_f32(v113, v46); + float32x2_t v116 = vadd_f32(v95, v108); + float32x2_t v117 = vsub_f32(v95, v108); + float32x2_t v118 = vadd_f32(v114, v116); + float32x2_t v119 = vsub_f32(v114, v116); + float32x2_t v137 = vrev64_f32(v117); + float32x2_t v138 = vmul_f32(v137, v136); + int16x4_t v143 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v118, 15), (int32x2_t){0, 0})); + int16x4_t v155 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v119, 15), (int32x2_t){0, 0})); + float32x2_t v139 = vadd_f32(v115, v138); + float32x2_t v140 = vsub_f32(v115, v138); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v143), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v155), 0); + int16x4_t v149 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v140, 15), (int32x2_t){0, 0})); + int16x4_t v161 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v139, 15), (int32x2_t){0, 0})); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v149), 0); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v161), 0); + v5 += 1 * idist; + v6 += 1 * odist; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cs16_ab_t_gs4(const armral_cmplx_f32_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, + const armral_cmplx_f32_t *restrict w, + int howmany, int idist, int odist, + float dir) { + int64_t v0 = istride; + int64_t v1 = idist; + int64_t v2 = ostride; + int64_t v3 = odist; + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + int32_t *v6 = (int32_t *)y; + const float32x2_t *v7 = (const float32x2_t *)w; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * v1; + int64_t v12 = v10 * v3; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + int64_t v13 = j; + float v110 = -1.0000000000000000e+00F; + const float32x2_t *v165 = &v5[v0]; + int32_t *v209 = &v6[v2]; + int64_t v19 = v0 * 2; + int64_t v54 = v0 * 3; 
+ int64_t v76 = v10 * 2; + int64_t v77 = v13 * 3; + float v113 = v4 * v110; + int64_t v135 = v2 * 2; + int64_t v143 = v2 * 3; + const float32x2_t *v186 = &v5[0]; + svint64_t v187 = svindex_s64(0, v1); + int32_t *v200 = &v6[0]; + svint64_t v228 = svindex_s64(0, v3); + int64_t v36 = v10 + v77; + svfloat32_t v72 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v77])); + int64_t v78 = v76 + v77; + const float32x2_t *v156 = &v5[v19]; + svfloat32_t v167 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v165), v187)); + const float32x2_t *v175 = &v5[v54]; + svfloat32_t v188 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v186), v187)); + svfloat32_t v192 = svdup_n_f32(v113); + int32_t *v218 = &v6[v135]; + int32_t *v227 = &v6[v143]; + svfloat32_t v37 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v36])); + svfloat32_t zero73 = svdup_n_f32(0); + svfloat32_t v73 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero73, v167, v72, 0), + v167, v72, 90); + svfloat32_t v79 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v78])); + svfloat32_t v158 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v156), v187)); + svfloat32_t v177 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v175), v187)); + svfloat32_t zero38 = svdup_n_f32(0); + svfloat32_t v38 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero38, v158, v37, 0), + v158, v37, 90); + svfloat32_t zero80 = svdup_n_f32(0); + svfloat32_t v80 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero80, v177, v79, 0), + v177, v79, 90); + svfloat32_t v88 = svadd_f32_x(svptrue_b32(), v188, v38); + svfloat32_t v89 = svsub_f32_x(svptrue_b32(), v188, v38); + svfloat32_t v90 = svadd_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v91 = svsub_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v92 = svadd_f32_x(svptrue_b32(), v88, v90); + svfloat32_t v93 = svsub_f32_x(svptrue_b32(), v88, v90); + svfloat32_t zero115 = svdup_n_f32(0); + svfloat32_t v115 = svcmla_f32_x(pred_full, zero115, v192, v91, 90); + svfloat32_t v116 = svadd_f32_x(svptrue_b32(), v89, v115); + svfloat32_t v117 = svsub_f32_x(svptrue_b32(), v89, v115); + svint16_t v120 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v92, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v136 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v93, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v128 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v117, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v144 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v116, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v200), v228, + svreinterpret_u64_s16(v120)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v218), v228, + svreinterpret_u64_s16(v136)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v209), v228, + svreinterpret_u64_s16(v128)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v227), v228, + 
svreinterpret_u64_s16(v144)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cs16_ab_t_gs5(const armral_cmplx_f32_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, + const armral_cmplx_f32_t *restrict w, + int howmany, int idist, int odist, + float dir) { + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + int32_t *v6 = (int32_t *)y; + const float32x2_t *v7 = (const float32x2_t *)w; + for (int j = 0; j < howmany; j += 1) { + float32x2_t v20 = v5[istride]; + float v158 = -1.2500000000000000e+00F; + float v162 = 5.5901699437494745e-01F; + float v165 = 1.5388417685876268e+00F; + float v166 = -1.5388417685876268e+00F; + float v172 = 5.8778525229247325e-01F; + float v173 = -5.8778525229247325e-01F; + float v179 = 3.6327126400268028e-01F; + float v180 = -3.6327126400268028e-01F; + float32x2_t v182 = (float32x2_t){v4, v4}; + float32x2_t v57 = vtrn1_f32(v20, v20); + float32x2_t v58 = vtrn2_f32(v20, v20); + float32x2_t v151 = v5[0]; + float32x2_t v159 = (float32x2_t){v158, v158}; + float32x2_t v163 = (float32x2_t){v162, v162}; + float32x2_t v167 = (float32x2_t){v165, v166}; + float32x2_t v174 = (float32x2_t){v172, v173}; + float32x2_t v181 = (float32x2_t){v179, v180}; + float32x2_t v38 = v5[istride * 4]; + float32x2_t v56 = v7[j * 8]; + int64_t v60 = j * 8 + 1; + int64_t v68 = 6 + j * 8; + float32x2_t v82 = v5[istride * 3]; + float32x2_t v100 = v5[istride * 2]; + int64_t v117 = 4 + j * 8; + int64_t v130 = 2 + j * 8; + float32x2_t v169 = vmul_f32(v182, v167); + float32x2_t v176 = vmul_f32(v182, v174); + float32x2_t v183 = vmul_f32(v182, v181); + float32x2_t v61 = v7[v60]; + float32x2_t v62 = vmul_f32(v57, v56); + float32x2_t v69 = v7[v68]; + float32x2_t v70 = vtrn1_f32(v38, v38); + float32x2_t v71 = vtrn2_f32(v38, v38); + int64_t v73 = v68 + 1; + float32x2_t v118 = v7[v117]; + float32x2_t v119 = vtrn1_f32(v82, v82); + float32x2_t v120 = vtrn2_f32(v82, v82); + int64_t v122 = v117 + 1; + float32x2_t v131 = v7[v130]; + float32x2_t v132 = vtrn1_f32(v100, v100); + float32x2_t v133 = vtrn2_f32(v100, v100); + int64_t v135 = v130 + 1; + float32x2_t v74 = v7[v73]; + float32x2_t v75 = vmul_f32(v70, v69); + float32x2_t v123 = v7[v122]; + float32x2_t v124 = vmul_f32(v119, v118); + float32x2_t v136 = v7[v135]; + float32x2_t v137 = vmul_f32(v132, v131); + float32x2_t v64 = vfma_f32(v62, v58, v61); + float32x2_t v77 = vfma_f32(v75, v71, v74); + float32x2_t v126 = vfma_f32(v124, v120, v123); + float32x2_t v139 = vfma_f32(v137, v133, v136); + float32x2_t v140 = vadd_f32(v64, v77); + float32x2_t v141 = vsub_f32(v64, v77); + float32x2_t v142 = vadd_f32(v126, v139); + float32x2_t v143 = vsub_f32(v126, v139); + float32x2_t v144 = vadd_f32(v140, v142); + float32x2_t v145 = vsub_f32(v140, v142); + float32x2_t v146 = vadd_f32(v141, v143); + float32x2_t v170 = vrev64_f32(v141); + float32x2_t v184 = vrev64_f32(v143); + float32x2_t v152 = vadd_f32(v144, v151); + float32x2_t v160 = vmul_f32(v144, v159); + float32x2_t v164 = vmul_f32(v145, v163); + float32x2_t v171 = vmul_f32(v170, v169); + float32x2_t v177 = vrev64_f32(v146); + float32x2_t v185 = vmul_f32(v184, v183); + float32x2_t v178 = vmul_f32(v177, v176); + float32x2_t v186 = vadd_f32(v152, v160); + int16x4_t v197 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v152, 15), (int32x2_t){0, 0})); + float32x2_t v187 = vadd_f32(v186, v164); + float32x2_t v188 = vsub_f32(v186, v164); + float32x2_t v189 = vsub_f32(v171, v178); + float32x2_t v190 = vadd_f32(v178, v185); + v6[0] = 
vget_lane_s32(vreinterpret_s32_s16(v197), 0); + float32x2_t v191 = vadd_f32(v187, v189); + float32x2_t v192 = vsub_f32(v187, v189); + float32x2_t v193 = vadd_f32(v188, v190); + float32x2_t v194 = vsub_f32(v188, v190); + int16x4_t v203 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v192, 15), (int32x2_t){0, 0})); + int16x4_t v209 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v194, 15), (int32x2_t){0, 0})); + int16x4_t v215 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v193, 15), (int32x2_t){0, 0})); + int16x4_t v221 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v191, 15), (int32x2_t){0, 0})); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v203), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v209), 0); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v215), 0); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v221), 0); + v5 += 1 * idist; + v6 += 1 * odist; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cs16_ab_t_gs5(const armral_cmplx_f32_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, + const armral_cmplx_f32_t *restrict w, + int howmany, int idist, int odist, + float dir) { + int64_t v0 = istride; + int64_t v1 = idist; + int64_t v2 = ostride; + int64_t v3 = odist; + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + int32_t *v6 = (int32_t *)y; + const float32x2_t *v7 = (const float32x2_t *)w; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * v1; + int64_t v12 = v10 * v3; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + int64_t v13 = j; + float v123 = -1.2500000000000000e+00F; + float v128 = 5.5901699437494745e-01F; + float v133 = -1.5388417685876268e+00F; + float v140 = -5.8778525229247325e-01F; + float v147 = -3.6327126400268028e-01F; + const float32x2_t *v208 = &v5[v0]; + int32_t *v272 = &v6[v2]; + int64_t v33 = v0 * 4; + int64_t v55 = v10 * 3; + int64_t v61 = v0 * 3; + int64_t v75 = v0 * 2; + int64_t v90 = v10 * 2; + int64_t v98 = v13 * 4; + float v136 = v4 * v133; + float v143 = v4 * v140; + float v150 = v4 * v147; + int64_t v179 = v2 * 2; + int64_t v187 = v2 * 3; + int64_t v195 = v2 * 4; + const float32x2_t *v247 = &v5[0]; + svint64_t v248 = svindex_s64(0, v1); + svfloat32_t v251 = svdup_n_f32(v123); + svfloat32_t v252 = svdup_n_f32(v128); + int32_t *v263 = &v6[0]; + svint64_t v300 = svindex_s64(0, v3); + svfloat32_t v51 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v98])); + int64_t v57 = v55 + v98; + int64_t v92 = v90 + v98; + int64_t v99 = v10 + v98; + svfloat32_t v210 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v208), v248)); + const float32x2_t *v218 = &v5[v33]; + const float32x2_t *v228 = &v5[v61]; + const float32x2_t *v237 = &v5[v75]; + svfloat32_t v249 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v247), v248)); + svfloat32_t v253 = svdup_n_f32(v136); + svfloat32_t v254 = svdup_n_f32(v143); + svfloat32_t v255 = svdup_n_f32(v150); + int32_t *v281 = &v6[v179]; + int32_t *v290 = &v6[v187]; + int32_t *v299 = &v6[v195]; + svfloat32_t zero52 = svdup_n_f32(0); + svfloat32_t v52 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero52, v210, v51, 0), + v210, v51, 90); + svfloat32_t v58 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v57])); + svfloat32_t v93 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v92])); + svfloat32_t v100 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double 
*)v7)[v99])); + svfloat32_t v220 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v218), v248)); + svfloat32_t v230 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v228), v248)); + svfloat32_t v239 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v237), v248)); + svfloat32_t zero59 = svdup_n_f32(0); + svfloat32_t v59 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero59, v220, v58, 0), + v220, v58, 90); + svfloat32_t zero94 = svdup_n_f32(0); + svfloat32_t v94 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero94, v230, v93, 0), + v230, v93, 90); + svfloat32_t zero101 = svdup_n_f32(0); + svfloat32_t v101 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero101, v239, v100, 0), + v239, v100, 90); + svfloat32_t v102 = svadd_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v103 = svsub_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v104 = svadd_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v105 = svsub_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v106 = svadd_f32_x(svptrue_b32(), v102, v104); + svfloat32_t v107 = svsub_f32_x(svptrue_b32(), v102, v104); + svfloat32_t v108 = svadd_f32_x(svptrue_b32(), v103, v105); + svfloat32_t zero138 = svdup_n_f32(0); + svfloat32_t v138 = svcmla_f32_x(pred_full, zero138, v253, v103, 90); + svfloat32_t v116 = svadd_f32_x(svptrue_b32(), v106, v249); + svfloat32_t zero145 = svdup_n_f32(0); + svfloat32_t v145 = svcmla_f32_x(pred_full, zero145, v254, v108, 90); + svfloat32_t v153 = svmla_f32_x(pred_full, v116, v106, v251); + svfloat32_t v156 = svsub_f32_x(svptrue_b32(), v138, v145); + svfloat32_t v157 = svcmla_f32_x(pred_full, v145, v255, v105, 90); + svint16_t v164 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v116, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v154 = svmla_f32_x(pred_full, v153, v107, v252); + svfloat32_t v155 = svmls_f32_x(pred_full, v153, v107, v252); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v263), v300, + svreinterpret_u64_s16(v164)); + svfloat32_t v158 = svadd_f32_x(svptrue_b32(), v154, v156); + svfloat32_t v159 = svsub_f32_x(svptrue_b32(), v154, v156); + svfloat32_t v160 = svadd_f32_x(svptrue_b32(), v155, v157); + svfloat32_t v161 = svsub_f32_x(svptrue_b32(), v155, v157); + svint16_t v172 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v159, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v180 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v161, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v188 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v160, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v196 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v158, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v272), v300, + svreinterpret_u64_s16(v172)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v281), v300, + svreinterpret_u64_s16(v180)); + 
svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v290), v300, + svreinterpret_u64_s16(v188)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v299), v300, + svreinterpret_u64_s16(v196)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cs16_ab_t_gs6(const armral_cmplx_f32_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, + const armral_cmplx_f32_t *restrict w, + int howmany, int idist, int odist, + float dir) { + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + int32_t *v6 = (int32_t *)y; + const float32x2_t *v7 = (const float32x2_t *)w; + for (int j = 0; j < howmany; j += 1) { + float32x2_t v131 = v5[istride]; + float v211 = -1.4999999999999998e+00F; + float v214 = 8.6602540378443871e-01F; + float v215 = -8.6602540378443871e-01F; + float32x2_t v217 = (float32x2_t){v4, v4}; + float32x2_t v163 = vtrn1_f32(v131, v131); + float32x2_t v164 = vtrn2_f32(v131, v131); + float32x2_t v175 = v5[0]; + float32x2_t v212 = (float32x2_t){v211, v211}; + float32x2_t v216 = (float32x2_t){v214, v215}; + float32x2_t v20 = v5[istride * 3]; + int64_t v37 = 4 + j * 10; + float32x2_t v51 = v5[istride * 2]; + float32x2_t v69 = v5[istride * 5]; + int64_t v86 = 2 + j * 10; + int64_t v99 = 8 + j * 10; + float32x2_t v113 = v5[istride * 4]; + int64_t v148 = 6 + j * 10; + float32x2_t v162 = v7[j * 10]; + int64_t v166 = j * 10 + 1; + float32x2_t v218 = vmul_f32(v217, v216); + float32x2_t v38 = v7[v37]; + float32x2_t v39 = vtrn1_f32(v20, v20); + float32x2_t v40 = vtrn2_f32(v20, v20); + int64_t v42 = v37 + 1; + float32x2_t v87 = v7[v86]; + float32x2_t v88 = vtrn1_f32(v51, v51); + float32x2_t v89 = vtrn2_f32(v51, v51); + int64_t v91 = v86 + 1; + float32x2_t v100 = v7[v99]; + float32x2_t v101 = vtrn1_f32(v69, v69); + float32x2_t v102 = vtrn2_f32(v69, v69); + int64_t v104 = v99 + 1; + float32x2_t v149 = v7[v148]; + float32x2_t v150 = vtrn1_f32(v113, v113); + float32x2_t v151 = vtrn2_f32(v113, v113); + int64_t v153 = v148 + 1; + float32x2_t v167 = v7[v166]; + float32x2_t v168 = vmul_f32(v163, v162); + float32x2_t v43 = v7[v42]; + float32x2_t v44 = vmul_f32(v39, v38); + float32x2_t v92 = v7[v91]; + float32x2_t v93 = vmul_f32(v88, v87); + float32x2_t v105 = v7[v104]; + float32x2_t v106 = vmul_f32(v101, v100); + float32x2_t v154 = v7[v153]; + float32x2_t v155 = vmul_f32(v150, v149); + float32x2_t v170 = vfma_f32(v168, v164, v167); + float32x2_t v46 = vfma_f32(v44, v40, v43); + float32x2_t v95 = vfma_f32(v93, v89, v92); + float32x2_t v108 = vfma_f32(v106, v102, v105); + float32x2_t v157 = vfma_f32(v155, v151, v154); + float32x2_t v176 = vadd_f32(v175, v46); + float32x2_t v177 = vsub_f32(v175, v46); + float32x2_t v178 = vadd_f32(v95, v108); + float32x2_t v179 = vsub_f32(v95, v108); + float32x2_t v180 = vadd_f32(v157, v170); + float32x2_t v181 = vsub_f32(v157, v170); + float32x2_t v182 = vadd_f32(v178, v180); + float32x2_t v183 = vsub_f32(v178, v180); + float32x2_t v203 = vadd_f32(v179, v181); + float32x2_t v204 = vsub_f32(v179, v181); + float32x2_t v184 = vadd_f32(v182, v176); + float32x2_t v192 = vmul_f32(v182, v212); + float32x2_t v198 = vrev64_f32(v183); + float32x2_t v205 = vadd_f32(v203, v177); + float32x2_t v213 = vmul_f32(v203, v212); + float32x2_t v219 = vrev64_f32(v204); + float32x2_t v199 = vmul_f32(v198, v218); + float32x2_t v200 = vadd_f32(v184, v192); + float32x2_t v220 = vmul_f32(v219, v218); + float32x2_t v221 = vadd_f32(v205, v213); + int16x4_t v226 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v184, 15), 
(int32x2_t){0, 0})); + int16x4_t v232 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v205, 15), (int32x2_t){0, 0})); + float32x2_t v201 = vadd_f32(v200, v199); + float32x2_t v202 = vsub_f32(v200, v199); + float32x2_t v222 = vadd_f32(v221, v220); + float32x2_t v223 = vsub_f32(v221, v220); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v226), 0); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v232), 0); + int16x4_t v238 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v202, 15), (int32x2_t){0, 0})); + int16x4_t v244 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v223, 15), (int32x2_t){0, 0})); + int16x4_t v250 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v201, 15), (int32x2_t){0, 0})); + int16x4_t v256 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v222, 15), (int32x2_t){0, 0})); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v238), 0); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v244), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v250), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v256), 0); + v5 += 1 * idist; + v6 += 1 * odist; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cs16_ab_t_gs6(const armral_cmplx_f32_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, + const armral_cmplx_f32_t *restrict w, + int howmany, int idist, int odist, + float dir) { + int64_t v0 = istride; + int64_t v1 = idist; + int64_t v2 = ostride; + int64_t v3 = odist; + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + int32_t *v6 = (int32_t *)y; + const float32x2_t *v7 = (const float32x2_t *)w; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * v1; + int64_t v12 = v10 * v3; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + int64_t v13 = j; + float v168 = -1.4999999999999998e+00F; + float v173 = -8.6602540378443871e-01F; + const float32x2_t *v272 = &v5[v0]; + int32_t *v327 = &v6[v2]; + int64_t v19 = v0 * 3; + int64_t v34 = v10 * 2; + int64_t v40 = v0 * 2; + int64_t v54 = v0 * 5; + int64_t v76 = v10 * 4; + int64_t v82 = v0 * 4; + int64_t v111 = v10 * 3; + int64_t v119 = v13 * 5; + float v176 = v4 * v173; + int64_t v191 = v2 * 3; + int64_t v199 = v2 * 4; + int64_t v215 = v2 * 2; + int64_t v223 = v2 * 5; + const float32x2_t *v284 = &v5[0]; + svint64_t v285 = svindex_s64(0, v1); + svfloat32_t v291 = svdup_n_f32(v168); + int32_t *v300 = &v6[0]; + svint64_t v346 = svindex_s64(0, v3); + int64_t v36 = v34 + v119; + int64_t v71 = v10 + v119; + int64_t v78 = v76 + v119; + int64_t v113 = v111 + v119; + svfloat32_t v121 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v119])); + const float32x2_t *v236 = &v5[v19]; + const float32x2_t *v245 = &v5[v40]; + const float32x2_t *v254 = &v5[v54]; + const float32x2_t *v263 = &v5[v82]; + svfloat32_t v274 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v272), v285)); + svfloat32_t v286 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v284), v285)); + svfloat32_t v292 = svdup_n_f32(v176); + int32_t *v309 = &v6[v191]; + int32_t *v318 = &v6[v199]; + int32_t *v336 = &v6[v215]; + int32_t *v345 = &v6[v223]; + svfloat32_t v37 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v36])); + svfloat32_t v72 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v71])); + svfloat32_t v79 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v78])); + svfloat32_t v114 = svreinterpret_f32_f64( + 
svld1_f64(pred_full, &((const double *)v7)[v113])); + svfloat32_t zero122 = svdup_n_f32(0); + svfloat32_t v122 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero122, v274, v121, 0), + v274, v121, 90); + svfloat32_t v238 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v236), v285)); + svfloat32_t v247 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v245), v285)); + svfloat32_t v256 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v254), v285)); + svfloat32_t v265 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v263), v285)); + svfloat32_t zero38 = svdup_n_f32(0); + svfloat32_t v38 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero38, v238, v37, 0), + v238, v37, 90); + svfloat32_t zero73 = svdup_n_f32(0); + svfloat32_t v73 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero73, v247, v72, 0), + v247, v72, 90); + svfloat32_t zero80 = svdup_n_f32(0); + svfloat32_t v80 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero80, v256, v79, 0), + v256, v79, 90); + svfloat32_t zero115 = svdup_n_f32(0); + svfloat32_t v115 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero115, v265, v114, 0), + v265, v114, 90); + svfloat32_t v130 = svadd_f32_x(svptrue_b32(), v286, v38); + svfloat32_t v131 = svsub_f32_x(svptrue_b32(), v286, v38); + svfloat32_t v132 = svadd_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v133 = svsub_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v134 = svadd_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v135 = svsub_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v136 = svadd_f32_x(svptrue_b32(), v132, v134); + svfloat32_t v137 = svsub_f32_x(svptrue_b32(), v132, v134); + svfloat32_t v159 = svadd_f32_x(svptrue_b32(), v133, v135); + svfloat32_t v160 = svsub_f32_x(svptrue_b32(), v133, v135); + svfloat32_t v138 = svadd_f32_x(svptrue_b32(), v136, v130); + svfloat32_t zero155 = svdup_n_f32(0); + svfloat32_t v155 = svcmla_f32_x(pred_full, zero155, v292, v137, 90); + svfloat32_t v161 = svadd_f32_x(svptrue_b32(), v159, v131); + svfloat32_t zero178 = svdup_n_f32(0); + svfloat32_t v178 = svcmla_f32_x(pred_full, zero178, v292, v160, 90); + svfloat32_t v156 = svmla_f32_x(pred_full, v138, v136, v291); + svfloat32_t v179 = svmla_f32_x(pred_full, v161, v159, v291); + svint16_t v184 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v138, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v192 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v161, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v157 = svadd_f32_x(svptrue_b32(), v156, v155); + svfloat32_t v158 = svsub_f32_x(svptrue_b32(), v156, v155); + svfloat32_t v180 = svadd_f32_x(svptrue_b32(), v179, v178); + svfloat32_t v181 = svsub_f32_x(svptrue_b32(), v179, v178); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v300), v346, + svreinterpret_u64_s16(v184)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v309), v346, + svreinterpret_u64_s16(v192)); + svint16_t v200 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v158, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v208 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + 
pred_full, svmul_n_f32_x(pred_full, v181, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v216 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v157, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v224 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v180, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v318), v346, + svreinterpret_u64_s16(v200)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v327), v346, + svreinterpret_u64_s16(v208)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v336), v346, + svreinterpret_u64_s16(v216)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v345), v346, + svreinterpret_u64_s16(v224)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cs16_ab_t_gs7(const armral_cmplx_f32_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, + const armral_cmplx_f32_t *restrict w, + int howmany, int idist, int odist, + float dir) { + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + int32_t *v6 = (int32_t *)y; + const float32x2_t *v7 = (const float32x2_t *)w; + for (int j = 0; j < howmany; j += 1) { + float32x2_t v20 = v5[istride]; + float v229 = -1.1666666666666665e+00F; + float v233 = 7.9015646852540022e-01F; + float v237 = 5.5854267289647742e-02F; + float v241 = 7.3430220123575241e-01F; + float v244 = 4.4095855184409838e-01F; + float v245 = -4.4095855184409838e-01F; + float v251 = 3.4087293062393137e-01F; + float v252 = -3.4087293062393137e-01F; + float v258 = -5.3396936033772524e-01F; + float v259 = 5.3396936033772524e-01F; + float v265 = 8.7484229096165667e-01F; + float v266 = -8.7484229096165667e-01F; + float32x2_t v268 = (float32x2_t){v4, v4}; + float32x2_t v57 = vtrn1_f32(v20, v20); + float32x2_t v58 = vtrn2_f32(v20, v20); + float32x2_t v214 = v5[0]; + float32x2_t v230 = (float32x2_t){v229, v229}; + float32x2_t v234 = (float32x2_t){v233, v233}; + float32x2_t v238 = (float32x2_t){v237, v237}; + float32x2_t v242 = (float32x2_t){v241, v241}; + float32x2_t v246 = (float32x2_t){v244, v245}; + float32x2_t v253 = (float32x2_t){v251, v252}; + float32x2_t v260 = (float32x2_t){v258, v259}; + float32x2_t v267 = (float32x2_t){v265, v266}; + float32x2_t v38 = v5[istride * 6]; + float32x2_t v56 = v7[j * 12]; + int64_t v60 = j * 12 + 1; + int64_t v68 = 10 + j * 12; + float32x2_t v82 = v5[istride * 4]; + float32x2_t v100 = v5[istride * 3]; + int64_t v117 = 6 + j * 12; + int64_t v130 = 4 + j * 12; + float32x2_t v144 = v5[istride * 2]; + float32x2_t v162 = v5[istride * 5]; + int64_t v179 = 2 + j * 12; + int64_t v192 = 8 + j * 12; + float32x2_t v248 = vmul_f32(v268, v246); + float32x2_t v255 = vmul_f32(v268, v253); + float32x2_t v262 = vmul_f32(v268, v260); + float32x2_t v269 = vmul_f32(v268, v267); + float32x2_t v61 = v7[v60]; + float32x2_t v62 = vmul_f32(v57, v56); + float32x2_t v69 = v7[v68]; + float32x2_t v70 = vtrn1_f32(v38, v38); + float32x2_t v71 = vtrn2_f32(v38, v38); + int64_t v73 = v68 + 1; + float32x2_t v118 = v7[v117]; + float32x2_t v119 = vtrn1_f32(v82, v82); + float32x2_t v120 = vtrn2_f32(v82, v82); + int64_t v122 = v117 + 1; + float32x2_t v131 = v7[v130]; + float32x2_t v132 = vtrn1_f32(v100, v100); + 
float32x2_t v133 = vtrn2_f32(v100, v100); + int64_t v135 = v130 + 1; + float32x2_t v180 = v7[v179]; + float32x2_t v181 = vtrn1_f32(v144, v144); + float32x2_t v182 = vtrn2_f32(v144, v144); + int64_t v184 = v179 + 1; + float32x2_t v193 = v7[v192]; + float32x2_t v194 = vtrn1_f32(v162, v162); + float32x2_t v195 = vtrn2_f32(v162, v162); + int64_t v197 = v192 + 1; + float32x2_t v74 = v7[v73]; + float32x2_t v75 = vmul_f32(v70, v69); + float32x2_t v123 = v7[v122]; + float32x2_t v124 = vmul_f32(v119, v118); + float32x2_t v136 = v7[v135]; + float32x2_t v137 = vmul_f32(v132, v131); + float32x2_t v185 = v7[v184]; + float32x2_t v186 = vmul_f32(v181, v180); + float32x2_t v198 = v7[v197]; + float32x2_t v199 = vmul_f32(v194, v193); + float32x2_t v64 = vfma_f32(v62, v58, v61); + float32x2_t v77 = vfma_f32(v75, v71, v74); + float32x2_t v126 = vfma_f32(v124, v120, v123); + float32x2_t v139 = vfma_f32(v137, v133, v136); + float32x2_t v188 = vfma_f32(v186, v182, v185); + float32x2_t v201 = vfma_f32(v199, v195, v198); + float32x2_t v202 = vadd_f32(v64, v77); + float32x2_t v203 = vsub_f32(v64, v77); + float32x2_t v204 = vadd_f32(v126, v139); + float32x2_t v205 = vsub_f32(v126, v139); + float32x2_t v206 = vadd_f32(v188, v201); + float32x2_t v207 = vsub_f32(v188, v201); + float32x2_t v208 = vadd_f32(v202, v204); + float32x2_t v216 = vsub_f32(v202, v204); + float32x2_t v217 = vsub_f32(v204, v206); + float32x2_t v218 = vsub_f32(v206, v202); + float32x2_t v219 = vadd_f32(v203, v205); + float32x2_t v221 = vsub_f32(v203, v205); + float32x2_t v222 = vsub_f32(v205, v207); + float32x2_t v223 = vsub_f32(v207, v203); + float32x2_t v209 = vadd_f32(v208, v206); + float32x2_t v220 = vadd_f32(v219, v207); + float32x2_t v235 = vmul_f32(v216, v234); + float32x2_t v239 = vmul_f32(v217, v238); + float32x2_t v243 = vmul_f32(v218, v242); + float32x2_t v256 = vrev64_f32(v221); + float32x2_t v263 = vrev64_f32(v222); + float32x2_t v270 = vrev64_f32(v223); + float32x2_t v215 = vadd_f32(v209, v214); + float32x2_t v231 = vmul_f32(v209, v230); + float32x2_t v249 = vrev64_f32(v220); + float32x2_t v257 = vmul_f32(v256, v255); + float32x2_t v264 = vmul_f32(v263, v262); + float32x2_t v271 = vmul_f32(v270, v269); + float32x2_t v250 = vmul_f32(v249, v248); + float32x2_t v272 = vadd_f32(v215, v231); + int16x4_t v293 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v215, 15), (int32x2_t){0, 0})); + float32x2_t v273 = vadd_f32(v272, v235); + float32x2_t v275 = vsub_f32(v272, v235); + float32x2_t v277 = vsub_f32(v272, v239); + float32x2_t v279 = vadd_f32(v250, v257); + float32x2_t v281 = vsub_f32(v250, v257); + float32x2_t v283 = vsub_f32(v250, v264); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v293), 0); + float32x2_t v274 = vadd_f32(v273, v239); + float32x2_t v276 = vsub_f32(v275, v243); + float32x2_t v278 = vadd_f32(v277, v243); + float32x2_t v280 = vadd_f32(v279, v264); + float32x2_t v282 = vsub_f32(v281, v271); + float32x2_t v284 = vadd_f32(v283, v271); + float32x2_t v285 = vadd_f32(v274, v280); + float32x2_t v286 = vsub_f32(v274, v280); + float32x2_t v287 = vadd_f32(v276, v282); + float32x2_t v288 = vsub_f32(v276, v282); + float32x2_t v289 = vadd_f32(v278, v284); + float32x2_t v290 = vsub_f32(v278, v284); + int16x4_t v299 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v286, 15), (int32x2_t){0, 0})); + int16x4_t v305 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v288, 15), (int32x2_t){0, 0})); + int16x4_t v311 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v289, 15), (int32x2_t){0, 0})); + int16x4_t v317 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v290, 15), 
(int32x2_t){0, 0})); + int16x4_t v323 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v287, 15), (int32x2_t){0, 0})); + int16x4_t v329 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v285, 15), (int32x2_t){0, 0})); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v299), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v305), 0); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v311), 0); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v317), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v323), 0); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v329), 0); + v5 += 1 * idist; + v6 += 1 * odist; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cs16_ab_t_gs7(const armral_cmplx_f32_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, + const armral_cmplx_f32_t *restrict w, + int howmany, int idist, int odist, + float dir) { + int64_t v0 = istride; + int64_t v1 = idist; + int64_t v2 = ostride; + int64_t v3 = odist; + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + int32_t *v6 = (int32_t *)y; + const float32x2_t *v7 = (const float32x2_t *)w; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * v1; + int64_t v12 = v10 * v3; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + int64_t v13 = j; + float v174 = -1.1666666666666665e+00F; + float v179 = 7.9015646852540022e-01F; + float v184 = 5.5854267289647742e-02F; + float v189 = 7.3430220123575241e-01F; + float v194 = -4.4095855184409838e-01F; + float v201 = -3.4087293062393137e-01F; + float v208 = 5.3396936033772524e-01F; + float v215 = -8.7484229096165667e-01F; + const float32x2_t *v302 = &v5[v0]; + int32_t *v387 = &v6[v2]; + int64_t v33 = v0 * 6; + int64_t v55 = v10 * 5; + int64_t v61 = v0 * 4; + int64_t v75 = v0 * 3; + int64_t v90 = v10 * 3; + int64_t v97 = v10 * 2; + int64_t v103 = v0 * 2; + int64_t v117 = v0 * 5; + int64_t v139 = v10 * 4; + int64_t v140 = v13 * 6; + float v197 = v4 * v194; + float v204 = v4 * v201; + float v211 = v4 * v208; + float v218 = v4 * v215; + int64_t v257 = v2 * 2; + int64_t v265 = v2 * 3; + int64_t v273 = v2 * 4; + int64_t v281 = v2 * 5; + int64_t v289 = v2 * 6; + const float32x2_t *v359 = &v5[0]; + svint64_t v360 = svindex_s64(0, v1); + svfloat32_t v363 = svdup_n_f32(v174); + svfloat32_t v364 = svdup_n_f32(v179); + svfloat32_t v365 = svdup_n_f32(v184); + svfloat32_t v366 = svdup_n_f32(v189); + int32_t *v378 = &v6[0]; + svint64_t v433 = svindex_s64(0, v3); + svfloat32_t v51 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v140])); + int64_t v57 = v55 + v140; + int64_t v92 = v90 + v140; + int64_t v99 = v97 + v140; + int64_t v134 = v10 + v140; + int64_t v141 = v139 + v140; + svfloat32_t v304 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v302), v360)); + const float32x2_t *v312 = &v5[v33]; + const float32x2_t *v322 = &v5[v61]; + const float32x2_t *v331 = &v5[v75]; + const float32x2_t *v340 = &v5[v103]; + const float32x2_t *v349 = &v5[v117]; + svfloat32_t v361 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v359), v360)); + svfloat32_t v367 = svdup_n_f32(v197); + svfloat32_t v368 = svdup_n_f32(v204); + svfloat32_t v369 = svdup_n_f32(v211); + svfloat32_t v370 = svdup_n_f32(v218); + int32_t *v396 = &v6[v257]; + int32_t *v405 = &v6[v265]; + int32_t *v414 = &v6[v273]; + int32_t *v423 = &v6[v281]; + int32_t *v432 = &v6[v289]; + svfloat32_t zero52 = svdup_n_f32(0); + svfloat32_t 
v52 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero52, v304, v51, 0), + v304, v51, 90); + svfloat32_t v58 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v57])); + svfloat32_t v93 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v92])); + svfloat32_t v100 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v99])); + svfloat32_t v135 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v134])); + svfloat32_t v142 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v141])); + svfloat32_t v314 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v312), v360)); + svfloat32_t v324 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v322), v360)); + svfloat32_t v333 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v331), v360)); + svfloat32_t v342 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v340), v360)); + svfloat32_t v351 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v349), v360)); + svfloat32_t zero59 = svdup_n_f32(0); + svfloat32_t v59 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero59, v314, v58, 0), + v314, v58, 90); + svfloat32_t zero94 = svdup_n_f32(0); + svfloat32_t v94 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero94, v324, v93, 0), + v324, v93, 90); + svfloat32_t zero101 = svdup_n_f32(0); + svfloat32_t v101 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero101, v333, v100, 0), + v333, v100, 90); + svfloat32_t zero136 = svdup_n_f32(0); + svfloat32_t v136 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero136, v342, v135, 0), + v342, v135, 90); + svfloat32_t zero143 = svdup_n_f32(0); + svfloat32_t v143 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero143, v351, v142, 0), + v351, v142, 90); + svfloat32_t v144 = svadd_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v145 = svsub_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v146 = svadd_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v147 = svsub_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v148 = svadd_f32_x(svptrue_b32(), v136, v143); + svfloat32_t v149 = svsub_f32_x(svptrue_b32(), v136, v143); + svfloat32_t v150 = svadd_f32_x(svptrue_b32(), v144, v146); + svfloat32_t v160 = svsub_f32_x(svptrue_b32(), v144, v146); + svfloat32_t v161 = svsub_f32_x(svptrue_b32(), v146, v148); + svfloat32_t v162 = svsub_f32_x(svptrue_b32(), v148, v144); + svfloat32_t v163 = svadd_f32_x(svptrue_b32(), v145, v147); + svfloat32_t v165 = svsub_f32_x(svptrue_b32(), v145, v147); + svfloat32_t v166 = svsub_f32_x(svptrue_b32(), v147, v149); + svfloat32_t v167 = svsub_f32_x(svptrue_b32(), v149, v145); + svfloat32_t v151 = svadd_f32_x(svptrue_b32(), v150, v148); + svfloat32_t v164 = svadd_f32_x(svptrue_b32(), v163, v149); + svfloat32_t zero206 = svdup_n_f32(0); + svfloat32_t v206 = svcmla_f32_x(pred_full, zero206, v368, v165, 90); + svfloat32_t zero213 = svdup_n_f32(0); + svfloat32_t v213 = svcmla_f32_x(pred_full, zero213, v369, v166, 90); + svfloat32_t zero220 = svdup_n_f32(0); + svfloat32_t v220 = svcmla_f32_x(pred_full, zero220, v370, v167, 90); + svfloat32_t v159 = svadd_f32_x(svptrue_b32(), v151, v361); + svfloat32_t zero199 = svdup_n_f32(0); + svfloat32_t v199 = svcmla_f32_x(pred_full, zero199, v367, v164, 90); + svfloat32_t v221 = svmla_f32_x(pred_full, v159, v151, v363); + svfloat32_t v228 = svadd_f32_x(svptrue_b32(), v199, v206); + svfloat32_t v230 = 
svsub_f32_x(svptrue_b32(), v199, v206); + svfloat32_t v232 = svsub_f32_x(svptrue_b32(), v199, v213); + svint16_t v242 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v159, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v222 = svmla_f32_x(pred_full, v221, v160, v364); + svfloat32_t v224 = svmls_f32_x(pred_full, v221, v160, v364); + svfloat32_t v226 = svmls_f32_x(pred_full, v221, v161, v365); + svfloat32_t v229 = svadd_f32_x(svptrue_b32(), v228, v213); + svfloat32_t v231 = svsub_f32_x(svptrue_b32(), v230, v220); + svfloat32_t v233 = svadd_f32_x(svptrue_b32(), v232, v220); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v378), v433, + svreinterpret_u64_s16(v242)); + svfloat32_t v223 = svmla_f32_x(pred_full, v222, v161, v365); + svfloat32_t v225 = svmls_f32_x(pred_full, v224, v162, v366); + svfloat32_t v227 = svmla_f32_x(pred_full, v226, v162, v366); + svfloat32_t v234 = svadd_f32_x(svptrue_b32(), v223, v229); + svfloat32_t v235 = svsub_f32_x(svptrue_b32(), v223, v229); + svfloat32_t v236 = svadd_f32_x(svptrue_b32(), v225, v231); + svfloat32_t v237 = svsub_f32_x(svptrue_b32(), v225, v231); + svfloat32_t v238 = svadd_f32_x(svptrue_b32(), v227, v233); + svfloat32_t v239 = svsub_f32_x(svptrue_b32(), v227, v233); + svint16_t v250 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v235, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v258 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v237, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v266 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v238, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v274 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v239, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v282 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v236, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v290 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v234, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v387), v433, + svreinterpret_u64_s16(v250)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v396), v433, + svreinterpret_u64_s16(v258)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v405), v433, + svreinterpret_u64_s16(v266)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v414), v433, + svreinterpret_u64_s16(v274)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v423), v433, + svreinterpret_u64_s16(v282)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v432), v433, + svreinterpret_u64_s16(v290)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cs16_ab_t_gs8(const armral_cmplx_f32_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, + const 
armral_cmplx_f32_t *restrict w, + int howmany, int idist, int odist, + float dir) { + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + int32_t *v6 = (int32_t *)y; + const float32x2_t *v7 = (const float32x2_t *)w; + for (int j = 0; j < howmany; j += 1) { + float32x2_t v113 = v5[istride]; + float v277 = 1.0000000000000000e+00F; + float v278 = -1.0000000000000000e+00F; + float v285 = -7.0710678118654746e-01F; + float32x2_t v287 = (float32x2_t){v4, v4}; + float v292 = 7.0710678118654757e-01F; + float32x2_t v150 = vtrn1_f32(v113, v113); + float32x2_t v151 = vtrn2_f32(v113, v113); + float32x2_t v237 = v5[0]; + float32x2_t v279 = (float32x2_t){v277, v278}; + float32x2_t v286 = (float32x2_t){v292, v285}; + float32x2_t v293 = (float32x2_t){v292, v292}; + float32x2_t v20 = v5[istride * 4]; + int64_t v37 = 6 + j * 14; + float32x2_t v51 = v5[istride * 2]; + float32x2_t v69 = v5[istride * 6]; + int64_t v86 = 2 + j * 14; + int64_t v99 = 10 + j * 14; + float32x2_t v131 = v5[istride * 5]; + float32x2_t v149 = v7[j * 14]; + int64_t v153 = j * 14 + 1; + int64_t v161 = 8 + j * 14; + float32x2_t v175 = v5[istride * 3]; + float32x2_t v193 = v5[istride * 7]; + int64_t v210 = 4 + j * 14; + int64_t v223 = 12 + j * 14; + float32x2_t v281 = vmul_f32(v287, v279); + float32x2_t v288 = vmul_f32(v287, v286); + float32x2_t v38 = v7[v37]; + float32x2_t v39 = vtrn1_f32(v20, v20); + float32x2_t v40 = vtrn2_f32(v20, v20); + int64_t v42 = v37 + 1; + float32x2_t v87 = v7[v86]; + float32x2_t v88 = vtrn1_f32(v51, v51); + float32x2_t v89 = vtrn2_f32(v51, v51); + int64_t v91 = v86 + 1; + float32x2_t v100 = v7[v99]; + float32x2_t v101 = vtrn1_f32(v69, v69); + float32x2_t v102 = vtrn2_f32(v69, v69); + int64_t v104 = v99 + 1; + float32x2_t v154 = v7[v153]; + float32x2_t v155 = vmul_f32(v150, v149); + float32x2_t v162 = v7[v161]; + float32x2_t v163 = vtrn1_f32(v131, v131); + float32x2_t v164 = vtrn2_f32(v131, v131); + int64_t v166 = v161 + 1; + float32x2_t v211 = v7[v210]; + float32x2_t v212 = vtrn1_f32(v175, v175); + float32x2_t v213 = vtrn2_f32(v175, v175); + int64_t v215 = v210 + 1; + float32x2_t v224 = v7[v223]; + float32x2_t v225 = vtrn1_f32(v193, v193); + float32x2_t v226 = vtrn2_f32(v193, v193); + int64_t v228 = v223 + 1; + float32x2_t v43 = v7[v42]; + float32x2_t v44 = vmul_f32(v39, v38); + float32x2_t v92 = v7[v91]; + float32x2_t v93 = vmul_f32(v88, v87); + float32x2_t v105 = v7[v104]; + float32x2_t v106 = vmul_f32(v101, v100); + float32x2_t v167 = v7[v166]; + float32x2_t v168 = vmul_f32(v163, v162); + float32x2_t v216 = v7[v215]; + float32x2_t v217 = vmul_f32(v212, v211); + float32x2_t v229 = v7[v228]; + float32x2_t v230 = vmul_f32(v225, v224); + float32x2_t v157 = vfma_f32(v155, v151, v154); + float32x2_t v46 = vfma_f32(v44, v40, v43); + float32x2_t v95 = vfma_f32(v93, v89, v92); + float32x2_t v108 = vfma_f32(v106, v102, v105); + float32x2_t v170 = vfma_f32(v168, v164, v167); + float32x2_t v219 = vfma_f32(v217, v213, v216); + float32x2_t v232 = vfma_f32(v230, v226, v229); + float32x2_t v238 = vadd_f32(v237, v46); + float32x2_t v239 = vsub_f32(v237, v46); + float32x2_t v240 = vadd_f32(v95, v108); + float32x2_t v241 = vsub_f32(v95, v108); + float32x2_t v242 = vadd_f32(v157, v170); + float32x2_t v243 = vsub_f32(v157, v170); + float32x2_t v244 = vadd_f32(v219, v232); + float32x2_t v245 = vsub_f32(v219, v232); + float32x2_t v246 = vadd_f32(v238, v240); + float32x2_t v247 = vsub_f32(v238, v240); + float32x2_t v248 = vadd_f32(v242, v244); + float32x2_t v249 = vsub_f32(v242, v244); + float32x2_t v252 = 
vadd_f32(v243, v245); + float32x2_t v253 = vsub_f32(v243, v245); + float32x2_t v282 = vrev64_f32(v241); + float32x2_t v250 = vadd_f32(v246, v248); + float32x2_t v251 = vsub_f32(v246, v248); + float32x2_t v271 = vrev64_f32(v249); + float32x2_t v283 = vmul_f32(v282, v281); + float32x2_t v289 = vrev64_f32(v252); + float32x2_t v294 = vmul_f32(v253, v293); + float32x2_t v272 = vmul_f32(v271, v281); + float32x2_t v290 = vmul_f32(v289, v288); + float32x2_t v297 = vadd_f32(v239, v294); + float32x2_t v298 = vsub_f32(v239, v294); + int16x4_t v307 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v250, 15), (int32x2_t){0, 0})); + int16x4_t v331 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v251, 15), (int32x2_t){0, 0})); + float32x2_t v295 = vadd_f32(v247, v272); + float32x2_t v296 = vsub_f32(v247, v272); + float32x2_t v299 = vadd_f32(v283, v290); + float32x2_t v300 = vsub_f32(v283, v290); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v307), 0); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v331), 0); + float32x2_t v301 = vadd_f32(v297, v299); + float32x2_t v302 = vsub_f32(v297, v299); + float32x2_t v303 = vadd_f32(v298, v300); + float32x2_t v304 = vsub_f32(v298, v300); + int16x4_t v319 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v296, 15), (int32x2_t){0, 0})); + int16x4_t v343 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v295, 15), (int32x2_t){0, 0})); + int16x4_t v313 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v302, 15), (int32x2_t){0, 0})); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v319), 0); + int16x4_t v325 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v303, 15), (int32x2_t){0, 0})); + int16x4_t v337 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v304, 15), (int32x2_t){0, 0})); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v343), 0); + int16x4_t v349 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v301, 15), (int32x2_t){0, 0})); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v313), 0); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v325), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v337), 0); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v349), 0); + v5 += 1 * idist; + v6 += 1 * odist; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cs16_ab_t_gs8(const armral_cmplx_f32_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, + const armral_cmplx_f32_t *restrict w, + int howmany, int idist, int odist, + float dir) { + int64_t v0 = istride; + int64_t v1 = idist; + int64_t v2 = ostride; + int64_t v3 = odist; + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + int32_t *v6 = (int32_t *)y; + const float32x2_t *v7 = (const float32x2_t *)w; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * v1; + int64_t v12 = v10 * v3; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + int64_t v13 = j; + float v216 = -1.0000000000000000e+00F; + float v223 = -7.0710678118654746e-01F; + float v230 = 7.0710678118654757e-01F; + const float32x2_t *v341 = &v5[v0]; + int32_t *v407 = &v6[v2]; + int64_t v19 = v0 * 4; + int64_t v34 = v10 * 3; + int64_t v40 = v0 * 2; + int64_t v54 = v0 * 6; + int64_t v76 = v10 * 5; + int64_t v96 = v0 * 5; + int64_t v118 = v10 * 4; + int64_t v124 = v0 * 3; + int64_t v138 = v0 * 7; + int64_t v153 = v10 * 2; + int64_t v160 = v10 * 6; + int64_t v161 = v13 * 7; + float v219 = v4 * v216; + float v226 = v4 * v223; + int64_t v261 = v2 * 2; + int64_t v269 = v2 * 3; + int64_t v277 = v2 * 4; + int64_t v285 = v2 * 5; + int64_t v293 = v2 * 6; 
+ int64_t v301 = v2 * 7; + const float32x2_t *v380 = &v5[0]; + svint64_t v381 = svindex_s64(0, v1); + svfloat32_t v390 = svdup_n_f32(v230); + int32_t *v398 = &v6[0]; + svint64_t v462 = svindex_s64(0, v3); + int64_t v36 = v34 + v161; + int64_t v71 = v10 + v161; + int64_t v78 = v76 + v161; + svfloat32_t v114 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v161])); + int64_t v120 = v118 + v161; + int64_t v155 = v153 + v161; + int64_t v162 = v160 + v161; + const float32x2_t *v314 = &v5[v19]; + const float32x2_t *v323 = &v5[v40]; + const float32x2_t *v332 = &v5[v54]; + svfloat32_t v343 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v341), v381)); + const float32x2_t *v351 = &v5[v96]; + const float32x2_t *v361 = &v5[v124]; + const float32x2_t *v370 = &v5[v138]; + svfloat32_t v382 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v380), v381)); + svfloat32_t v388 = svdup_n_f32(v219); + svfloat32_t v389 = svdup_n_f32(v226); + int32_t *v416 = &v6[v261]; + int32_t *v425 = &v6[v269]; + int32_t *v434 = &v6[v277]; + int32_t *v443 = &v6[v285]; + int32_t *v452 = &v6[v293]; + int32_t *v461 = &v6[v301]; + svfloat32_t v37 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v36])); + svfloat32_t v72 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v71])); + svfloat32_t v79 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v78])); + svfloat32_t zero115 = svdup_n_f32(0); + svfloat32_t v115 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero115, v343, v114, 0), + v343, v114, 90); + svfloat32_t v121 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v120])); + svfloat32_t v156 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v155])); + svfloat32_t v163 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v162])); + svfloat32_t v316 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v314), v381)); + svfloat32_t v325 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v323), v381)); + svfloat32_t v334 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v332), v381)); + svfloat32_t v353 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v351), v381)); + svfloat32_t v363 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v361), v381)); + svfloat32_t v372 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v370), v381)); + svfloat32_t zero38 = svdup_n_f32(0); + svfloat32_t v38 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero38, v316, v37, 0), + v316, v37, 90); + svfloat32_t zero73 = svdup_n_f32(0); + svfloat32_t v73 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero73, v325, v72, 0), + v325, v72, 90); + svfloat32_t zero80 = svdup_n_f32(0); + svfloat32_t v80 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero80, v334, v79, 0), + v334, v79, 90); + svfloat32_t zero122 = svdup_n_f32(0); + svfloat32_t v122 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero122, v353, v121, 0), + v353, v121, 90); + svfloat32_t zero157 = svdup_n_f32(0); + svfloat32_t v157 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero157, v363, v156, 0), + v363, v156, 90); + svfloat32_t zero164 = svdup_n_f32(0); + svfloat32_t v164 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero164, v372, v163, 0), + v372, v163, 90); + svfloat32_t v172 = 
svadd_f32_x(svptrue_b32(), v382, v38); + svfloat32_t v173 = svsub_f32_x(svptrue_b32(), v382, v38); + svfloat32_t v174 = svadd_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v175 = svsub_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v176 = svadd_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v177 = svsub_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v178 = svadd_f32_x(svptrue_b32(), v157, v164); + svfloat32_t v179 = svsub_f32_x(svptrue_b32(), v157, v164); + svfloat32_t v180 = svadd_f32_x(svptrue_b32(), v172, v174); + svfloat32_t v181 = svsub_f32_x(svptrue_b32(), v172, v174); + svfloat32_t v182 = svadd_f32_x(svptrue_b32(), v176, v178); + svfloat32_t v183 = svsub_f32_x(svptrue_b32(), v176, v178); + svfloat32_t v186 = svadd_f32_x(svptrue_b32(), v177, v179); + svfloat32_t v187 = svsub_f32_x(svptrue_b32(), v177, v179); + svfloat32_t zero221 = svdup_n_f32(0); + svfloat32_t v221 = svcmla_f32_x(pred_full, zero221, v388, v175, 90); + svfloat32_t v184 = svadd_f32_x(svptrue_b32(), v180, v182); + svfloat32_t v185 = svsub_f32_x(svptrue_b32(), v180, v182); + svfloat32_t zero209 = svdup_n_f32(0); + svfloat32_t v209 = svcmla_f32_x(pred_full, zero209, v388, v183, 90); + svfloat32_t zero228 = svdup_n_f32(0); + svfloat32_t v228 = svcmla_f32_x(pred_full, zero228, v389, v186, 90); + svfloat32_t v234 = svadd_f32_x(svptrue_b32(), v181, v209); + svfloat32_t v235 = svsub_f32_x(svptrue_b32(), v181, v209); + svfloat32_t v236 = svmla_f32_x(pred_full, v173, v187, v390); + svfloat32_t v237 = svmls_f32_x(pred_full, v173, v187, v390); + svfloat32_t v238 = svadd_f32_x(svptrue_b32(), v221, v228); + svfloat32_t v239 = svsub_f32_x(svptrue_b32(), v221, v228); + svint16_t v246 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v184, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v278 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v185, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v240 = svadd_f32_x(svptrue_b32(), v236, v238); + svfloat32_t v241 = svsub_f32_x(svptrue_b32(), v236, v238); + svfloat32_t v242 = svadd_f32_x(svptrue_b32(), v237, v239); + svfloat32_t v243 = svsub_f32_x(svptrue_b32(), v237, v239); + svint16_t v262 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v235, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v294 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v234, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v398), v462, + svreinterpret_u64_s16(v246)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v434), v462, + svreinterpret_u64_s16(v278)); + svint16_t v254 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v241, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v270 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v242, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v286 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + 
pred_full, svmul_n_f32_x(pred_full, v243, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v302 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v240, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v416), v462, + svreinterpret_u64_s16(v262)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v452), v462, + svreinterpret_u64_s16(v294)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v407), v462, + svreinterpret_u64_s16(v254)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v425), v462, + svreinterpret_u64_s16(v270)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v443), v462, + svreinterpret_u64_s16(v286)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v461), v462, + svreinterpret_u64_s16(v302)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cs16_ab_t_gs9(const armral_cmplx_f32_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, + const armral_cmplx_f32_t *restrict w, + int howmany, int idist, int odist, + float dir) { + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + int32_t *v6 = (int32_t *)y; + const float32x2_t *v7 = (const float32x2_t *)w; + for (int j = 0; j < howmany; j += 1) { + float32x2_t v20 = v5[istride]; + float v294 = -5.0000000000000000e-01F; + float v305 = -1.4999999999999998e+00F; + float v308 = 8.6602540378443871e-01F; + float v309 = -8.6602540378443871e-01F; + float v316 = 7.6604444311897801e-01F; + float v320 = 9.3969262078590832e-01F; + float v324 = -1.7364817766693039e-01F; + float v327 = 6.4278760968653925e-01F; + float v328 = -6.4278760968653925e-01F; + float v334 = -3.4202014332566888e-01F; + float v335 = 3.4202014332566888e-01F; + float v341 = 9.8480775301220802e-01F; + float v342 = -9.8480775301220802e-01F; + float32x2_t v344 = (float32x2_t){v4, v4}; + float32x2_t v57 = vtrn1_f32(v20, v20); + float32x2_t v58 = vtrn2_f32(v20, v20); + float32x2_t v279 = v5[0]; + float32x2_t v295 = (float32x2_t){v294, v294}; + float32x2_t v306 = (float32x2_t){v305, v305}; + float32x2_t v310 = (float32x2_t){v308, v309}; + float32x2_t v317 = (float32x2_t){v316, v316}; + float32x2_t v321 = (float32x2_t){v320, v320}; + float32x2_t v325 = (float32x2_t){v324, v324}; + float32x2_t v329 = (float32x2_t){v327, v328}; + float32x2_t v336 = (float32x2_t){v334, v335}; + float32x2_t v343 = (float32x2_t){v341, v342}; + float32x2_t v38 = v5[istride * 8]; + float32x2_t v56 = v7[j * 16]; + int64_t v60 = j * 16 + 1; + int64_t v68 = 14 + j * 16; + float32x2_t v82 = v5[istride * 7]; + float32x2_t v100 = v5[istride * 2]; + int64_t v117 = 12 + j * 16; + int64_t v130 = 2 + j * 16; + float32x2_t v144 = v5[istride * 3]; + float32x2_t v162 = v5[istride * 6]; + int64_t v179 = 4 + j * 16; + int64_t v192 = 10 + j * 16; + float32x2_t v206 = v5[istride * 4]; + float32x2_t v224 = v5[istride * 5]; + int64_t v241 = 6 + j * 16; + int64_t v254 = 8 + j * 16; + float32x2_t v312 = vmul_f32(v344, v310); + float32x2_t v331 = vmul_f32(v344, v329); + float32x2_t v338 = vmul_f32(v344, v336); + float32x2_t v345 = vmul_f32(v344, v343); + float32x2_t v61 = v7[v60]; + float32x2_t v62 = vmul_f32(v57, v56); + float32x2_t v69 = v7[v68]; + float32x2_t v70 = vtrn1_f32(v38, v38); + float32x2_t v71 = vtrn2_f32(v38, v38); + int64_t v73 = v68 + 1; + float32x2_t v118 = 
v7[v117]; + float32x2_t v119 = vtrn1_f32(v82, v82); + float32x2_t v120 = vtrn2_f32(v82, v82); + int64_t v122 = v117 + 1; + float32x2_t v131 = v7[v130]; + float32x2_t v132 = vtrn1_f32(v100, v100); + float32x2_t v133 = vtrn2_f32(v100, v100); + int64_t v135 = v130 + 1; + float32x2_t v180 = v7[v179]; + float32x2_t v181 = vtrn1_f32(v144, v144); + float32x2_t v182 = vtrn2_f32(v144, v144); + int64_t v184 = v179 + 1; + float32x2_t v193 = v7[v192]; + float32x2_t v194 = vtrn1_f32(v162, v162); + float32x2_t v195 = vtrn2_f32(v162, v162); + int64_t v197 = v192 + 1; + float32x2_t v242 = v7[v241]; + float32x2_t v243 = vtrn1_f32(v206, v206); + float32x2_t v244 = vtrn2_f32(v206, v206); + int64_t v246 = v241 + 1; + float32x2_t v255 = v7[v254]; + float32x2_t v256 = vtrn1_f32(v224, v224); + float32x2_t v257 = vtrn2_f32(v224, v224); + int64_t v259 = v254 + 1; + float32x2_t v74 = v7[v73]; + float32x2_t v75 = vmul_f32(v70, v69); + float32x2_t v123 = v7[v122]; + float32x2_t v124 = vmul_f32(v119, v118); + float32x2_t v136 = v7[v135]; + float32x2_t v137 = vmul_f32(v132, v131); + float32x2_t v185 = v7[v184]; + float32x2_t v186 = vmul_f32(v181, v180); + float32x2_t v198 = v7[v197]; + float32x2_t v199 = vmul_f32(v194, v193); + float32x2_t v247 = v7[v246]; + float32x2_t v248 = vmul_f32(v243, v242); + float32x2_t v260 = v7[v259]; + float32x2_t v261 = vmul_f32(v256, v255); + float32x2_t v64 = vfma_f32(v62, v58, v61); + float32x2_t v77 = vfma_f32(v75, v71, v74); + float32x2_t v126 = vfma_f32(v124, v120, v123); + float32x2_t v139 = vfma_f32(v137, v133, v136); + float32x2_t v188 = vfma_f32(v186, v182, v185); + float32x2_t v201 = vfma_f32(v199, v195, v198); + float32x2_t v250 = vfma_f32(v248, v244, v247); + float32x2_t v263 = vfma_f32(v261, v257, v260); + float32x2_t v264 = vadd_f32(v64, v77); + float32x2_t v265 = vsub_f32(v64, v77); + float32x2_t v266 = vadd_f32(v126, v139); + float32x2_t v267 = vsub_f32(v126, v139); + float32x2_t v268 = vadd_f32(v188, v201); + float32x2_t v269 = vsub_f32(v188, v201); + float32x2_t v270 = vadd_f32(v250, v263); + float32x2_t v271 = vsub_f32(v250, v263); + float32x2_t v272 = vadd_f32(v264, v266); + float32x2_t v281 = vadd_f32(v265, v267); + float32x2_t v283 = vsub_f32(v264, v266); + float32x2_t v284 = vsub_f32(v266, v270); + float32x2_t v285 = vsub_f32(v270, v264); + float32x2_t v286 = vsub_f32(v265, v267); + float32x2_t v287 = vsub_f32(v267, v271); + float32x2_t v288 = vsub_f32(v271, v265); + float32x2_t v307 = vmul_f32(v268, v306); + float32x2_t v313 = vrev64_f32(v269); + float32x2_t v273 = vadd_f32(v272, v270); + float32x2_t v282 = vadd_f32(v281, v271); + float32x2_t v314 = vmul_f32(v313, v312); + float32x2_t v318 = vmul_f32(v283, v317); + float32x2_t v322 = vmul_f32(v284, v321); + float32x2_t v326 = vmul_f32(v285, v325); + float32x2_t v332 = vrev64_f32(v286); + float32x2_t v339 = vrev64_f32(v287); + float32x2_t v346 = vrev64_f32(v288); + float32x2_t v274 = vadd_f32(v273, v268); + float32x2_t v296 = vmul_f32(v273, v295); + float32x2_t v302 = vrev64_f32(v282); + float32x2_t v333 = vmul_f32(v332, v331); + float32x2_t v340 = vmul_f32(v339, v338); + float32x2_t v347 = vmul_f32(v346, v345); + float32x2_t v280 = vadd_f32(v274, v279); + float32x2_t v303 = vmul_f32(v302, v312); + float32x2_t v348 = vadd_f32(v296, v296); + float32x2_t v361 = vadd_f32(v314, v333); + float32x2_t v363 = vsub_f32(v314, v340); + float32x2_t v365 = vsub_f32(v314, v333); + float32x2_t v349 = vadd_f32(v348, v296); + float32x2_t v353 = vadd_f32(v280, v307); + float32x2_t v362 = vadd_f32(v361, v340); + float32x2_t v364 = 
vadd_f32(v363, v347); + float32x2_t v366 = vsub_f32(v365, v347); + int16x4_t v375 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v280, 15), (int32x2_t){0, 0})); + float32x2_t v350 = vadd_f32(v280, v349); + float32x2_t v354 = vadd_f32(v353, v348); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v375), 0); + float32x2_t v351 = vadd_f32(v350, v303); + float32x2_t v352 = vsub_f32(v350, v303); + float32x2_t v355 = vadd_f32(v354, v318); + float32x2_t v357 = vsub_f32(v354, v322); + float32x2_t v359 = vsub_f32(v354, v318); + float32x2_t v356 = vadd_f32(v355, v322); + float32x2_t v358 = vadd_f32(v357, v326); + float32x2_t v360 = vsub_f32(v359, v326); + int16x4_t v393 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v352, 15), (int32x2_t){0, 0})); + int16x4_t v411 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v351, 15), (int32x2_t){0, 0})); + float32x2_t v367 = vadd_f32(v356, v362); + float32x2_t v368 = vsub_f32(v356, v362); + float32x2_t v369 = vadd_f32(v358, v364); + float32x2_t v370 = vsub_f32(v358, v364); + float32x2_t v371 = vadd_f32(v360, v366); + float32x2_t v372 = vsub_f32(v360, v366); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v393), 0); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v411), 0); + int16x4_t v381 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v368, 15), (int32x2_t){0, 0})); + int16x4_t v387 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v369, 15), (int32x2_t){0, 0})); + int16x4_t v399 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v372, 15), (int32x2_t){0, 0})); + int16x4_t v405 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v371, 15), (int32x2_t){0, 0})); + int16x4_t v417 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v370, 15), (int32x2_t){0, 0})); + int16x4_t v423 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v367, 15), (int32x2_t){0, 0})); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v381), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v387), 0); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v399), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v405), 0); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v417), 0); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v423), 0); + v5 += 1 * idist; + v6 += 1 * odist; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cs16_ab_t_gs9(const armral_cmplx_f32_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, + const armral_cmplx_f32_t *restrict w, + int howmany, int idist, int odist, + float dir) { + int64_t v0 = istride; + int64_t v1 = idist; + int64_t v2 = ostride; + int64_t v3 = odist; + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + int32_t *v6 = (int32_t *)y; + const float32x2_t *v7 = (const float32x2_t *)w; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * v1; + int64_t v12 = v10 * v3; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + int64_t v13 = j; + float v219 = -5.0000000000000000e-01F; + float v231 = -1.4999999999999998e+00F; + float v236 = -8.6602540378443871e-01F; + float v243 = 7.6604444311897801e-01F; + float v248 = 9.3969262078590832e-01F; + float v253 = -1.7364817766693039e-01F; + float v258 = -6.4278760968653925e-01F; + float v265 = 3.4202014332566888e-01F; + float v272 = -9.8480775301220802e-01F; + const float32x2_t *v381 = &v5[v0]; + int32_t *v486 = &v6[v2]; + int64_t v33 = v0 * 8; + int64_t v55 = v10 * 7; + int64_t v61 = v0 * 7; + int64_t v75 = v0 * 2; + int64_t v90 = v10 * 6; + int64_t v103 = v0 * 3; + int64_t v117 = v0 * 6; + int64_t v132 = v10 
* 2; + int64_t v139 = v10 * 5; + int64_t v145 = v0 * 4; + int64_t v159 = v0 * 5; + int64_t v174 = v10 * 3; + int64_t v181 = v10 * 4; + int64_t v182 = v13 * 8; + float v239 = v4 * v236; + float v261 = v4 * v258; + float v268 = v4 * v265; + float v275 = v4 * v272; + int64_t v320 = v2 * 2; + int64_t v328 = v2 * 3; + int64_t v336 = v2 * 4; + int64_t v344 = v2 * 5; + int64_t v352 = v2 * 6; + int64_t v360 = v2 * 7; + int64_t v368 = v2 * 8; + const float32x2_t *v456 = &v5[0]; + svint64_t v457 = svindex_s64(0, v1); + svfloat32_t v460 = svdup_n_f32(v219); + svfloat32_t v462 = svdup_n_f32(v231); + svfloat32_t v464 = svdup_n_f32(v243); + svfloat32_t v465 = svdup_n_f32(v248); + svfloat32_t v466 = svdup_n_f32(v253); + int32_t *v477 = &v6[0]; + svint64_t v550 = svindex_s64(0, v3); + svfloat32_t v51 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v182])); + int64_t v57 = v55 + v182; + int64_t v92 = v90 + v182; + int64_t v99 = v10 + v182; + int64_t v134 = v132 + v182; + int64_t v141 = v139 + v182; + int64_t v176 = v174 + v182; + int64_t v183 = v181 + v182; + svfloat32_t v383 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v381), v457)); + const float32x2_t *v391 = &v5[v33]; + const float32x2_t *v401 = &v5[v61]; + const float32x2_t *v410 = &v5[v75]; + const float32x2_t *v419 = &v5[v103]; + const float32x2_t *v428 = &v5[v117]; + const float32x2_t *v437 = &v5[v145]; + const float32x2_t *v446 = &v5[v159]; + svfloat32_t v458 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v456), v457)); + svfloat32_t v463 = svdup_n_f32(v239); + svfloat32_t v467 = svdup_n_f32(v261); + svfloat32_t v468 = svdup_n_f32(v268); + svfloat32_t v469 = svdup_n_f32(v275); + int32_t *v495 = &v6[v320]; + int32_t *v504 = &v6[v328]; + int32_t *v513 = &v6[v336]; + int32_t *v522 = &v6[v344]; + int32_t *v531 = &v6[v352]; + int32_t *v540 = &v6[v360]; + int32_t *v549 = &v6[v368]; + svfloat32_t zero52 = svdup_n_f32(0); + svfloat32_t v52 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero52, v383, v51, 0), + v383, v51, 90); + svfloat32_t v58 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v57])); + svfloat32_t v93 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v92])); + svfloat32_t v100 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v99])); + svfloat32_t v135 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v134])); + svfloat32_t v142 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v141])); + svfloat32_t v177 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v176])); + svfloat32_t v184 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v183])); + svfloat32_t v393 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v391), v457)); + svfloat32_t v403 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v401), v457)); + svfloat32_t v412 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v410), v457)); + svfloat32_t v421 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v419), v457)); + svfloat32_t v430 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v428), v457)); + svfloat32_t v439 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v437), v457)); + svfloat32_t v448 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const 
double *)(v446), v457)); + svfloat32_t zero59 = svdup_n_f32(0); + svfloat32_t v59 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero59, v393, v58, 0), + v393, v58, 90); + svfloat32_t zero94 = svdup_n_f32(0); + svfloat32_t v94 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero94, v403, v93, 0), + v403, v93, 90); + svfloat32_t zero101 = svdup_n_f32(0); + svfloat32_t v101 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero101, v412, v100, 0), + v412, v100, 90); + svfloat32_t zero136 = svdup_n_f32(0); + svfloat32_t v136 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero136, v421, v135, 0), + v421, v135, 90); + svfloat32_t zero143 = svdup_n_f32(0); + svfloat32_t v143 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero143, v430, v142, 0), + v430, v142, 90); + svfloat32_t zero178 = svdup_n_f32(0); + svfloat32_t v178 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero178, v439, v177, 0), + v439, v177, 90); + svfloat32_t zero185 = svdup_n_f32(0); + svfloat32_t v185 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero185, v448, v184, 0), + v448, v184, 90); + svfloat32_t v186 = svadd_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v187 = svsub_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v188 = svadd_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v189 = svsub_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v190 = svadd_f32_x(svptrue_b32(), v136, v143); + svfloat32_t v191 = svsub_f32_x(svptrue_b32(), v136, v143); + svfloat32_t v192 = svadd_f32_x(svptrue_b32(), v178, v185); + svfloat32_t v193 = svsub_f32_x(svptrue_b32(), v178, v185); + svfloat32_t v194 = svadd_f32_x(svptrue_b32(), v186, v188); + svfloat32_t v205 = svadd_f32_x(svptrue_b32(), v187, v189); + svfloat32_t v207 = svsub_f32_x(svptrue_b32(), v186, v188); + svfloat32_t v208 = svsub_f32_x(svptrue_b32(), v188, v192); + svfloat32_t v209 = svsub_f32_x(svptrue_b32(), v192, v186); + svfloat32_t v210 = svsub_f32_x(svptrue_b32(), v187, v189); + svfloat32_t v211 = svsub_f32_x(svptrue_b32(), v189, v193); + svfloat32_t v212 = svsub_f32_x(svptrue_b32(), v193, v187); + svfloat32_t zero241 = svdup_n_f32(0); + svfloat32_t v241 = svcmla_f32_x(pred_full, zero241, v463, v191, 90); + svfloat32_t v195 = svadd_f32_x(svptrue_b32(), v194, v192); + svfloat32_t v206 = svadd_f32_x(svptrue_b32(), v205, v193); + svfloat32_t zero263 = svdup_n_f32(0); + svfloat32_t v263 = svcmla_f32_x(pred_full, zero263, v467, v210, 90); + svfloat32_t zero270 = svdup_n_f32(0); + svfloat32_t v270 = svcmla_f32_x(pred_full, zero270, v468, v211, 90); + svfloat32_t zero277 = svdup_n_f32(0); + svfloat32_t v277 = svcmla_f32_x(pred_full, zero277, v469, v212, 90); + svfloat32_t v196 = svadd_f32_x(svptrue_b32(), v195, v190); + svfloat32_t v222 = svmul_f32_x(svptrue_b32(), v195, v460); + svfloat32_t zero229 = svdup_n_f32(0); + svfloat32_t v229 = svcmla_f32_x(pred_full, zero229, v463, v206, 90); + svfloat32_t v291 = svadd_f32_x(svptrue_b32(), v241, v263); + svfloat32_t v293 = svsub_f32_x(svptrue_b32(), v241, v270); + svfloat32_t v295 = svsub_f32_x(svptrue_b32(), v241, v263); + svfloat32_t v204 = svadd_f32_x(svptrue_b32(), v196, v458); + svfloat32_t v278 = svadd_f32_x(svptrue_b32(), v222, v222); + svfloat32_t v292 = svadd_f32_x(svptrue_b32(), v291, v270); + svfloat32_t v294 = svadd_f32_x(svptrue_b32(), v293, v277); + svfloat32_t v296 = svsub_f32_x(svptrue_b32(), v295, v277); + svfloat32_t v279 = svmla_f32_x(pred_full, v278, v195, v460); + svfloat32_t v283 = svmla_f32_x(pred_full, v204, v190, v462); + svint16_t v305 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( 
+ pred_full, svmul_n_f32_x(pred_full, v204, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v280 = svadd_f32_x(svptrue_b32(), v204, v279); + svfloat32_t v284 = svadd_f32_x(svptrue_b32(), v283, v278); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v477), v550, + svreinterpret_u64_s16(v305)); + svfloat32_t v281 = svadd_f32_x(svptrue_b32(), v280, v229); + svfloat32_t v282 = svsub_f32_x(svptrue_b32(), v280, v229); + svfloat32_t v285 = svmla_f32_x(pred_full, v284, v207, v464); + svfloat32_t v287 = svmls_f32_x(pred_full, v284, v208, v465); + svfloat32_t v289 = svmls_f32_x(pred_full, v284, v207, v464); + svfloat32_t v286 = svmla_f32_x(pred_full, v285, v208, v465); + svfloat32_t v288 = svmla_f32_x(pred_full, v287, v209, v466); + svfloat32_t v290 = svmls_f32_x(pred_full, v289, v209, v466); + svint16_t v329 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v282, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v353 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v281, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v297 = svadd_f32_x(svptrue_b32(), v286, v292); + svfloat32_t v298 = svsub_f32_x(svptrue_b32(), v286, v292); + svfloat32_t v299 = svadd_f32_x(svptrue_b32(), v288, v294); + svfloat32_t v300 = svsub_f32_x(svptrue_b32(), v288, v294); + svfloat32_t v301 = svadd_f32_x(svptrue_b32(), v290, v296); + svfloat32_t v302 = svsub_f32_x(svptrue_b32(), v290, v296); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v504), v550, + svreinterpret_u64_s16(v329)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v531), v550, + svreinterpret_u64_s16(v353)); + svint16_t v313 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v298, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v321 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v299, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v337 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v302, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v345 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v301, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v361 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v300, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v369 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v297, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v486), v550, + svreinterpret_u64_s16(v313)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v495), v550, + svreinterpret_u64_s16(v321)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v513), v550, + 
svreinterpret_u64_s16(v337)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v522), v550, + svreinterpret_u64_s16(v345)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v540), v550, + svreinterpret_u64_s16(v361)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v549), v550, + svreinterpret_u64_s16(v369)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cs16_ab_t_gs10(const armral_cmplx_f32_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, + const armral_cmplx_f32_t *restrict w, + int howmany, int idist, int odist, + float dir) { + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + int32_t *v6 = (int32_t *)y; + const float32x2_t *v7 = (const float32x2_t *)w; + for (int j = 0; j < howmany; j += 1) { + float32x2_t v193 = v5[istride]; + float v373 = -1.2500000000000000e+00F; + float v377 = 5.5901699437494745e-01F; + float v380 = 1.5388417685876268e+00F; + float v381 = -1.5388417685876268e+00F; + float v387 = 5.8778525229247325e-01F; + float v388 = -5.8778525229247325e-01F; + float v394 = 3.6327126400268028e-01F; + float v395 = -3.6327126400268028e-01F; + float32x2_t v397 = (float32x2_t){v4, v4}; + float32x2_t v225 = vtrn1_f32(v193, v193); + float32x2_t v226 = vtrn2_f32(v193, v193); + float32x2_t v299 = v5[0]; + float32x2_t v374 = (float32x2_t){v373, v373}; + float32x2_t v378 = (float32x2_t){v377, v377}; + float32x2_t v382 = (float32x2_t){v380, v381}; + float32x2_t v389 = (float32x2_t){v387, v388}; + float32x2_t v396 = (float32x2_t){v394, v395}; + float32x2_t v20 = v5[istride * 5]; + int64_t v37 = 8 + j * 18; + float32x2_t v51 = v5[istride * 2]; + float32x2_t v69 = v5[istride * 7]; + int64_t v86 = 2 + j * 18; + int64_t v99 = 12 + j * 18; + float32x2_t v113 = v5[istride * 4]; + float32x2_t v131 = v5[istride * 9]; + int64_t v148 = 6 + j * 18; + int64_t v161 = 16 + j * 18; + float32x2_t v175 = v5[istride * 6]; + int64_t v210 = 10 + j * 18; + float32x2_t v224 = v7[j * 18]; + int64_t v228 = j * 18 + 1; + float32x2_t v237 = v5[istride * 8]; + float32x2_t v255 = v5[istride * 3]; + int64_t v272 = 14 + j * 18; + int64_t v285 = 4 + j * 18; + float32x2_t v384 = vmul_f32(v397, v382); + float32x2_t v391 = vmul_f32(v397, v389); + float32x2_t v398 = vmul_f32(v397, v396); + float32x2_t v38 = v7[v37]; + float32x2_t v39 = vtrn1_f32(v20, v20); + float32x2_t v40 = vtrn2_f32(v20, v20); + int64_t v42 = v37 + 1; + float32x2_t v87 = v7[v86]; + float32x2_t v88 = vtrn1_f32(v51, v51); + float32x2_t v89 = vtrn2_f32(v51, v51); + int64_t v91 = v86 + 1; + float32x2_t v100 = v7[v99]; + float32x2_t v101 = vtrn1_f32(v69, v69); + float32x2_t v102 = vtrn2_f32(v69, v69); + int64_t v104 = v99 + 1; + float32x2_t v149 = v7[v148]; + float32x2_t v150 = vtrn1_f32(v113, v113); + float32x2_t v151 = vtrn2_f32(v113, v113); + int64_t v153 = v148 + 1; + float32x2_t v162 = v7[v161]; + float32x2_t v163 = vtrn1_f32(v131, v131); + float32x2_t v164 = vtrn2_f32(v131, v131); + int64_t v166 = v161 + 1; + float32x2_t v211 = v7[v210]; + float32x2_t v212 = vtrn1_f32(v175, v175); + float32x2_t v213 = vtrn2_f32(v175, v175); + int64_t v215 = v210 + 1; + float32x2_t v229 = v7[v228]; + float32x2_t v230 = vmul_f32(v225, v224); + float32x2_t v273 = v7[v272]; + float32x2_t v274 = vtrn1_f32(v237, v237); + float32x2_t v275 = vtrn2_f32(v237, v237); + int64_t v277 = v272 + 1; + float32x2_t v286 = v7[v285]; + float32x2_t v287 = vtrn1_f32(v255, v255); + float32x2_t v288 = vtrn2_f32(v255, v255); + int64_t v290 = v285 + 1; + float32x2_t v43 = v7[v42]; + 
float32x2_t v44 = vmul_f32(v39, v38); + float32x2_t v92 = v7[v91]; + float32x2_t v93 = vmul_f32(v88, v87); + float32x2_t v105 = v7[v104]; + float32x2_t v106 = vmul_f32(v101, v100); + float32x2_t v154 = v7[v153]; + float32x2_t v155 = vmul_f32(v150, v149); + float32x2_t v167 = v7[v166]; + float32x2_t v168 = vmul_f32(v163, v162); + float32x2_t v216 = v7[v215]; + float32x2_t v217 = vmul_f32(v212, v211); + float32x2_t v278 = v7[v277]; + float32x2_t v279 = vmul_f32(v274, v273); + float32x2_t v291 = v7[v290]; + float32x2_t v292 = vmul_f32(v287, v286); + float32x2_t v232 = vfma_f32(v230, v226, v229); + float32x2_t v46 = vfma_f32(v44, v40, v43); + float32x2_t v95 = vfma_f32(v93, v89, v92); + float32x2_t v108 = vfma_f32(v106, v102, v105); + float32x2_t v157 = vfma_f32(v155, v151, v154); + float32x2_t v170 = vfma_f32(v168, v164, v167); + float32x2_t v219 = vfma_f32(v217, v213, v216); + float32x2_t v281 = vfma_f32(v279, v275, v278); + float32x2_t v294 = vfma_f32(v292, v288, v291); + float32x2_t v300 = vadd_f32(v299, v46); + float32x2_t v301 = vsub_f32(v299, v46); + float32x2_t v302 = vadd_f32(v95, v108); + float32x2_t v303 = vsub_f32(v95, v108); + float32x2_t v304 = vadd_f32(v157, v170); + float32x2_t v305 = vsub_f32(v157, v170); + float32x2_t v306 = vadd_f32(v219, v232); + float32x2_t v307 = vsub_f32(v219, v232); + float32x2_t v308 = vadd_f32(v281, v294); + float32x2_t v309 = vsub_f32(v281, v294); + float32x2_t v310 = vadd_f32(v302, v308); + float32x2_t v311 = vsub_f32(v302, v308); + float32x2_t v312 = vadd_f32(v306, v304); + float32x2_t v313 = vsub_f32(v306, v304); + float32x2_t v360 = vadd_f32(v303, v309); + float32x2_t v361 = vsub_f32(v303, v309); + float32x2_t v362 = vadd_f32(v307, v305); + float32x2_t v363 = vsub_f32(v307, v305); + float32x2_t v314 = vadd_f32(v310, v312); + float32x2_t v315 = vsub_f32(v310, v312); + float32x2_t v316 = vadd_f32(v311, v313); + float32x2_t v335 = vrev64_f32(v311); + float32x2_t v349 = vrev64_f32(v313); + float32x2_t v364 = vadd_f32(v360, v362); + float32x2_t v365 = vsub_f32(v360, v362); + float32x2_t v366 = vadd_f32(v361, v363); + float32x2_t v385 = vrev64_f32(v361); + float32x2_t v399 = vrev64_f32(v363); + float32x2_t v317 = vadd_f32(v314, v300); + float32x2_t v325 = vmul_f32(v314, v374); + float32x2_t v329 = vmul_f32(v315, v378); + float32x2_t v336 = vmul_f32(v335, v384); + float32x2_t v342 = vrev64_f32(v316); + float32x2_t v350 = vmul_f32(v349, v398); + float32x2_t v367 = vadd_f32(v364, v301); + float32x2_t v375 = vmul_f32(v364, v374); + float32x2_t v379 = vmul_f32(v365, v378); + float32x2_t v386 = vmul_f32(v385, v384); + float32x2_t v392 = vrev64_f32(v366); + float32x2_t v400 = vmul_f32(v399, v398); + float32x2_t v343 = vmul_f32(v342, v391); + float32x2_t v351 = vadd_f32(v317, v325); + float32x2_t v393 = vmul_f32(v392, v391); + float32x2_t v401 = vadd_f32(v367, v375); + int16x4_t v412 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v317, 15), (int32x2_t){0, 0})); + int16x4_t v418 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v367, 15), (int32x2_t){0, 0})); + float32x2_t v352 = vadd_f32(v351, v329); + float32x2_t v353 = vsub_f32(v351, v329); + float32x2_t v354 = vsub_f32(v336, v343); + float32x2_t v355 = vadd_f32(v343, v350); + float32x2_t v402 = vadd_f32(v401, v379); + float32x2_t v403 = vsub_f32(v401, v379); + float32x2_t v404 = vsub_f32(v386, v393); + float32x2_t v405 = vadd_f32(v393, v400); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v412), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v418), 0); + float32x2_t v356 = vadd_f32(v352, v354); + float32x2_t 
v357 = vsub_f32(v352, v354); + float32x2_t v358 = vadd_f32(v353, v355); + float32x2_t v359 = vsub_f32(v353, v355); + float32x2_t v406 = vadd_f32(v402, v404); + float32x2_t v407 = vsub_f32(v402, v404); + float32x2_t v408 = vadd_f32(v403, v405); + float32x2_t v409 = vsub_f32(v403, v405); + int16x4_t v424 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v357, 15), (int32x2_t){0, 0})); + int16x4_t v430 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v407, 15), (int32x2_t){0, 0})); + int16x4_t v436 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v359, 15), (int32x2_t){0, 0})); + int16x4_t v442 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v409, 15), (int32x2_t){0, 0})); + int16x4_t v448 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v358, 15), (int32x2_t){0, 0})); + int16x4_t v454 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v408, 15), (int32x2_t){0, 0})); + int16x4_t v460 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v356, 15), (int32x2_t){0, 0})); + int16x4_t v466 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v406, 15), (int32x2_t){0, 0})); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v424), 0); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v430), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v436), 0); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v442), 0); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v448), 0); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v454), 0); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v460), 0); + v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v466), 0); + v5 += 1 * idist; + v6 += 1 * odist; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cs16_ab_t_gs10(const armral_cmplx_f32_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, + const armral_cmplx_f32_t *restrict w, + int howmany, int idist, int odist, + float dir) { + int64_t v0 = istride; + int64_t v1 = idist; + int64_t v2 = ostride; + int64_t v3 = odist; + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + int32_t *v6 = (int32_t *)y; + const float32x2_t *v7 = (const float32x2_t *)w; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * v1; + int64_t v12 = v10 * v3; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + int64_t v13 = j; + float v291 = -1.2500000000000000e+00F; + float v296 = 5.5901699437494745e-01F; + float v301 = -1.5388417685876268e+00F; + float v308 = -5.8778525229247325e-01F; + float v315 = -3.6327126400268028e-01F; + const float32x2_t *v470 = &v5[v0]; + int32_t *v549 = &v6[v2]; + int64_t v19 = v0 * 5; + int64_t v34 = v10 * 4; + int64_t v40 = v0 * 2; + int64_t v54 = v0 * 7; + int64_t v76 = v10 * 6; + int64_t v82 = v0 * 4; + int64_t v96 = v0 * 9; + int64_t v111 = v10 * 3; + int64_t v118 = v10 * 8; + int64_t v124 = v0 * 6; + int64_t v153 = v10 * 5; + int64_t v166 = v0 * 8; + int64_t v180 = v0 * 3; + int64_t v195 = v10 * 7; + int64_t v202 = v10 * 2; + int64_t v203 = v13 * 9; + float v304 = v4 * v301; + float v311 = v4 * v308; + float v318 = v4 * v315; + int64_t v339 = v2 * 5; + int64_t v347 = v2 * 6; + int64_t v363 = v2 * 2; + int64_t v371 = v2 * 7; + int64_t v379 = v2 * 8; + int64_t v387 = v2 * 3; + int64_t v395 = v2 * 4; + int64_t v403 = v2 * 9; + const float32x2_t *v500 = &v5[0]; + svint64_t v501 = svindex_s64(0, v1); + svfloat32_t v510 = svdup_n_f32(v291); + svfloat32_t v511 = svdup_n_f32(v296); + int32_t *v522 = &v6[0]; + svint64_t v604 = svindex_s64(0, v3); + int64_t v36 = v34 + v203; + int64_t v71 = v10 + v203; + int64_t 
v78 = v76 + v203; + int64_t v113 = v111 + v203; + int64_t v120 = v118 + v203; + int64_t v155 = v153 + v203; + svfloat32_t v163 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v203])); + int64_t v197 = v195 + v203; + int64_t v204 = v202 + v203; + const float32x2_t *v416 = &v5[v19]; + const float32x2_t *v425 = &v5[v40]; + const float32x2_t *v434 = &v5[v54]; + const float32x2_t *v443 = &v5[v82]; + const float32x2_t *v452 = &v5[v96]; + const float32x2_t *v461 = &v5[v124]; + svfloat32_t v472 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v470), v501)); + const float32x2_t *v481 = &v5[v166]; + const float32x2_t *v490 = &v5[v180]; + svfloat32_t v502 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v500), v501)); + svfloat32_t v512 = svdup_n_f32(v304); + svfloat32_t v513 = svdup_n_f32(v311); + svfloat32_t v514 = svdup_n_f32(v318); + int32_t *v531 = &v6[v339]; + int32_t *v540 = &v6[v347]; + int32_t *v558 = &v6[v363]; + int32_t *v567 = &v6[v371]; + int32_t *v576 = &v6[v379]; + int32_t *v585 = &v6[v387]; + int32_t *v594 = &v6[v395]; + int32_t *v603 = &v6[v403]; + svfloat32_t v37 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v36])); + svfloat32_t v72 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v71])); + svfloat32_t v79 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v78])); + svfloat32_t v114 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v113])); + svfloat32_t v121 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v120])); + svfloat32_t v156 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v155])); + svfloat32_t zero164 = svdup_n_f32(0); + svfloat32_t v164 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero164, v472, v163, 0), + v472, v163, 90); + svfloat32_t v198 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v197])); + svfloat32_t v205 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v204])); + svfloat32_t v418 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v416), v501)); + svfloat32_t v427 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v425), v501)); + svfloat32_t v436 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v434), v501)); + svfloat32_t v445 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v443), v501)); + svfloat32_t v454 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v452), v501)); + svfloat32_t v463 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v461), v501)); + svfloat32_t v483 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v481), v501)); + svfloat32_t v492 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v490), v501)); + svfloat32_t zero38 = svdup_n_f32(0); + svfloat32_t v38 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero38, v418, v37, 0), + v418, v37, 90); + svfloat32_t zero73 = svdup_n_f32(0); + svfloat32_t v73 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero73, v427, v72, 0), + v427, v72, 90); + svfloat32_t zero80 = svdup_n_f32(0); + svfloat32_t v80 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero80, v436, v79, 0), + v436, v79, 90); + svfloat32_t zero115 = svdup_n_f32(0); + svfloat32_t v115 = + svcmla_f32_x(pred_full, 
svcmla_f32_x(pred_full, zero115, v445, v114, 0), + v445, v114, 90); + svfloat32_t zero122 = svdup_n_f32(0); + svfloat32_t v122 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero122, v454, v121, 0), + v454, v121, 90); + svfloat32_t zero157 = svdup_n_f32(0); + svfloat32_t v157 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero157, v463, v156, 0), + v463, v156, 90); + svfloat32_t zero199 = svdup_n_f32(0); + svfloat32_t v199 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero199, v483, v198, 0), + v483, v198, 90); + svfloat32_t zero206 = svdup_n_f32(0); + svfloat32_t v206 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero206, v492, v205, 0), + v492, v205, 90); + svfloat32_t v214 = svadd_f32_x(svptrue_b32(), v502, v38); + svfloat32_t v215 = svsub_f32_x(svptrue_b32(), v502, v38); + svfloat32_t v216 = svadd_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v217 = svsub_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v218 = svadd_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v219 = svsub_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v220 = svadd_f32_x(svptrue_b32(), v157, v164); + svfloat32_t v221 = svsub_f32_x(svptrue_b32(), v157, v164); + svfloat32_t v222 = svadd_f32_x(svptrue_b32(), v199, v206); + svfloat32_t v223 = svsub_f32_x(svptrue_b32(), v199, v206); + svfloat32_t v224 = svadd_f32_x(svptrue_b32(), v216, v222); + svfloat32_t v225 = svsub_f32_x(svptrue_b32(), v216, v222); + svfloat32_t v226 = svadd_f32_x(svptrue_b32(), v220, v218); + svfloat32_t v227 = svsub_f32_x(svptrue_b32(), v220, v218); + svfloat32_t v277 = svadd_f32_x(svptrue_b32(), v217, v223); + svfloat32_t v278 = svsub_f32_x(svptrue_b32(), v217, v223); + svfloat32_t v279 = svadd_f32_x(svptrue_b32(), v221, v219); + svfloat32_t v280 = svsub_f32_x(svptrue_b32(), v221, v219); + svfloat32_t v228 = svadd_f32_x(svptrue_b32(), v224, v226); + svfloat32_t v229 = svsub_f32_x(svptrue_b32(), v224, v226); + svfloat32_t v230 = svadd_f32_x(svptrue_b32(), v225, v227); + svfloat32_t zero253 = svdup_n_f32(0); + svfloat32_t v253 = svcmla_f32_x(pred_full, zero253, v512, v225, 90); + svfloat32_t v281 = svadd_f32_x(svptrue_b32(), v277, v279); + svfloat32_t v282 = svsub_f32_x(svptrue_b32(), v277, v279); + svfloat32_t v283 = svadd_f32_x(svptrue_b32(), v278, v280); + svfloat32_t zero306 = svdup_n_f32(0); + svfloat32_t v306 = svcmla_f32_x(pred_full, zero306, v512, v278, 90); + svfloat32_t v231 = svadd_f32_x(svptrue_b32(), v228, v214); + svfloat32_t zero260 = svdup_n_f32(0); + svfloat32_t v260 = svcmla_f32_x(pred_full, zero260, v513, v230, 90); + svfloat32_t v284 = svadd_f32_x(svptrue_b32(), v281, v215); + svfloat32_t zero313 = svdup_n_f32(0); + svfloat32_t v313 = svcmla_f32_x(pred_full, zero313, v513, v283, 90); + svfloat32_t v268 = svmla_f32_x(pred_full, v231, v228, v510); + svfloat32_t v271 = svsub_f32_x(svptrue_b32(), v253, v260); + svfloat32_t v272 = svcmla_f32_x(pred_full, v260, v514, v227, 90); + svfloat32_t v321 = svmla_f32_x(pred_full, v284, v281, v510); + svfloat32_t v324 = svsub_f32_x(svptrue_b32(), v306, v313); + svfloat32_t v325 = svcmla_f32_x(pred_full, v313, v514, v280, 90); + svint16_t v332 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v231, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v340 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v284, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 
0x0000000000040004ULL))); + svfloat32_t v269 = svmla_f32_x(pred_full, v268, v229, v511); + svfloat32_t v270 = svmls_f32_x(pred_full, v268, v229, v511); + svfloat32_t v322 = svmla_f32_x(pred_full, v321, v282, v511); + svfloat32_t v323 = svmls_f32_x(pred_full, v321, v282, v511); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v522), v604, + svreinterpret_u64_s16(v332)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v531), v604, + svreinterpret_u64_s16(v340)); + svfloat32_t v273 = svadd_f32_x(svptrue_b32(), v269, v271); + svfloat32_t v274 = svsub_f32_x(svptrue_b32(), v269, v271); + svfloat32_t v275 = svadd_f32_x(svptrue_b32(), v270, v272); + svfloat32_t v276 = svsub_f32_x(svptrue_b32(), v270, v272); + svfloat32_t v326 = svadd_f32_x(svptrue_b32(), v322, v324); + svfloat32_t v327 = svsub_f32_x(svptrue_b32(), v322, v324); + svfloat32_t v328 = svadd_f32_x(svptrue_b32(), v323, v325); + svfloat32_t v329 = svsub_f32_x(svptrue_b32(), v323, v325); + svint16_t v348 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v274, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v356 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v327, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v364 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v276, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v372 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v329, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v380 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v275, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v388 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v328, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v396 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v273, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v404 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v326, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v540), v604, + svreinterpret_u64_s16(v348)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v549), v604, + svreinterpret_u64_s16(v356)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v558), v604, + svreinterpret_u64_s16(v364)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v567), v604, + svreinterpret_u64_s16(v372)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v576), v604, + svreinterpret_u64_s16(v380)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v585), v604, + svreinterpret_u64_s16(v388)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v594), v604, + svreinterpret_u64_s16(v396)); + svst1w_scatter_s64index_u64(pred_full, (unsigned 
*)(v603), v604, + svreinterpret_u64_s16(v404)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cs16_ab_t_gs11(const armral_cmplx_f32_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, + const armral_cmplx_f32_t *restrict w, + int howmany, int idist, int odist, + float dir) { + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + int32_t *v6 = (int32_t *)y; + const float32x2_t *v7 = (const float32x2_t *)w; + for (int j = 0; j < howmany; j += 1) { + float32x2_t v20 = v5[istride]; + float v373 = 1.1000000000000001e+00F; + float v376 = 3.3166247903554003e-01F; + float v377 = -3.3166247903554003e-01F; + float v384 = 5.1541501300188641e-01F; + float v388 = 9.4125353283118118e-01F; + float v392 = 1.4143537075597825e+00F; + float v396 = 8.5949297361449750e-01F; + float v400 = 4.2314838273285138e-02F; + float v404 = 3.8639279888589606e-01F; + float v408 = 5.1254589567200015e-01F; + float v412 = 1.0702757469471715e+00F; + float v416 = 5.5486073394528512e-01F; + float v419 = 1.2412944743900585e+00F; + float v420 = -1.2412944743900585e+00F; + float v426 = 2.0897833842005756e-01F; + float v427 = -2.0897833842005756e-01F; + float v433 = 3.7415717312460811e-01F; + float v434 = -3.7415717312460811e-01F; + float v440 = 4.9929922194110327e-02F; + float v441 = -4.9929922194110327e-02F; + float v447 = 6.5815896284539266e-01F; + float v448 = -6.5815896284539266e-01F; + float v454 = 6.3306543373877577e-01F; + float v455 = -6.3306543373877577e-01F; + float v461 = 1.0822460581641109e+00F; + float v462 = -1.0822460581641109e+00F; + float v468 = 8.1720737907134022e-01F; + float v469 = -8.1720737907134022e-01F; + float v475 = 4.2408709531871824e-01F; + float v476 = -4.2408709531871824e-01F; + float32x2_t v478 = (float32x2_t){v4, v4}; + float32x2_t v201 = vtrn1_f32(v20, v20); + float32x2_t v202 = vtrn2_f32(v20, v20); + float32x2_t v346 = v5[0]; + float32x2_t v374 = (float32x2_t){v373, v373}; + float32x2_t v378 = (float32x2_t){v376, v377}; + float32x2_t v385 = (float32x2_t){v384, v384}; + float32x2_t v389 = (float32x2_t){v388, v388}; + float32x2_t v393 = (float32x2_t){v392, v392}; + float32x2_t v397 = (float32x2_t){v396, v396}; + float32x2_t v401 = (float32x2_t){v400, v400}; + float32x2_t v405 = (float32x2_t){v404, v404}; + float32x2_t v409 = (float32x2_t){v408, v408}; + float32x2_t v413 = (float32x2_t){v412, v412}; + float32x2_t v417 = (float32x2_t){v416, v416}; + float32x2_t v421 = (float32x2_t){v419, v420}; + float32x2_t v428 = (float32x2_t){v426, v427}; + float32x2_t v435 = (float32x2_t){v433, v434}; + float32x2_t v442 = (float32x2_t){v440, v441}; + float32x2_t v449 = (float32x2_t){v447, v448}; + float32x2_t v456 = (float32x2_t){v454, v455}; + float32x2_t v463 = (float32x2_t){v461, v462}; + float32x2_t v470 = (float32x2_t){v468, v469}; + float32x2_t v477 = (float32x2_t){v475, v476}; + float32x2_t v38 = v5[istride * 10]; + float32x2_t v56 = v5[istride * 2]; + float32x2_t v74 = v5[istride * 9]; + float32x2_t v92 = v5[istride * 3]; + float32x2_t v110 = v5[istride * 8]; + float32x2_t v128 = v5[istride * 4]; + float32x2_t v146 = v5[istride * 7]; + float32x2_t v164 = v5[istride * 5]; + float32x2_t v182 = v5[istride * 6]; + float32x2_t v200 = v7[j * 20]; + int64_t v204 = j * 20 + 1; + int64_t v212 = 18 + j * 20; + int64_t v225 = 2 + j * 20; + int64_t v238 = 16 + j * 20; + int64_t v251 = 4 + j * 20; + int64_t v264 = 14 + j * 20; + int64_t v277 = 6 + j * 20; + int64_t v290 = 12 + j * 20; + int64_t v303 = 8 + j * 20; + int64_t v316 = 
10 + j * 20; + float32x2_t v380 = vmul_f32(v478, v378); + float32x2_t v423 = vmul_f32(v478, v421); + float32x2_t v430 = vmul_f32(v478, v428); + float32x2_t v437 = vmul_f32(v478, v435); + float32x2_t v444 = vmul_f32(v478, v442); + float32x2_t v451 = vmul_f32(v478, v449); + float32x2_t v458 = vmul_f32(v478, v456); + float32x2_t v465 = vmul_f32(v478, v463); + float32x2_t v472 = vmul_f32(v478, v470); + float32x2_t v479 = vmul_f32(v478, v477); + float32x2_t v205 = v7[v204]; + float32x2_t v206 = vmul_f32(v201, v200); + float32x2_t v213 = v7[v212]; + float32x2_t v214 = vtrn1_f32(v38, v38); + float32x2_t v215 = vtrn2_f32(v38, v38); + int64_t v217 = v212 + 1; + float32x2_t v226 = v7[v225]; + float32x2_t v227 = vtrn1_f32(v56, v56); + float32x2_t v228 = vtrn2_f32(v56, v56); + int64_t v230 = v225 + 1; + float32x2_t v239 = v7[v238]; + float32x2_t v240 = vtrn1_f32(v74, v74); + float32x2_t v241 = vtrn2_f32(v74, v74); + int64_t v243 = v238 + 1; + float32x2_t v252 = v7[v251]; + float32x2_t v253 = vtrn1_f32(v92, v92); + float32x2_t v254 = vtrn2_f32(v92, v92); + int64_t v256 = v251 + 1; + float32x2_t v265 = v7[v264]; + float32x2_t v266 = vtrn1_f32(v110, v110); + float32x2_t v267 = vtrn2_f32(v110, v110); + int64_t v269 = v264 + 1; + float32x2_t v278 = v7[v277]; + float32x2_t v279 = vtrn1_f32(v128, v128); + float32x2_t v280 = vtrn2_f32(v128, v128); + int64_t v282 = v277 + 1; + float32x2_t v291 = v7[v290]; + float32x2_t v292 = vtrn1_f32(v146, v146); + float32x2_t v293 = vtrn2_f32(v146, v146); + int64_t v295 = v290 + 1; + float32x2_t v304 = v7[v303]; + float32x2_t v305 = vtrn1_f32(v164, v164); + float32x2_t v306 = vtrn2_f32(v164, v164); + int64_t v308 = v303 + 1; + float32x2_t v317 = v7[v316]; + float32x2_t v318 = vtrn1_f32(v182, v182); + float32x2_t v319 = vtrn2_f32(v182, v182); + int64_t v321 = v316 + 1; + float32x2_t v218 = v7[v217]; + float32x2_t v219 = vmul_f32(v214, v213); + float32x2_t v231 = v7[v230]; + float32x2_t v232 = vmul_f32(v227, v226); + float32x2_t v244 = v7[v243]; + float32x2_t v245 = vmul_f32(v240, v239); + float32x2_t v257 = v7[v256]; + float32x2_t v258 = vmul_f32(v253, v252); + float32x2_t v270 = v7[v269]; + float32x2_t v271 = vmul_f32(v266, v265); + float32x2_t v283 = v7[v282]; + float32x2_t v284 = vmul_f32(v279, v278); + float32x2_t v296 = v7[v295]; + float32x2_t v297 = vmul_f32(v292, v291); + float32x2_t v309 = v7[v308]; + float32x2_t v310 = vmul_f32(v305, v304); + float32x2_t v322 = v7[v321]; + float32x2_t v323 = vmul_f32(v318, v317); + float32x2_t v208 = vfma_f32(v206, v202, v205); + float32x2_t v221 = vfma_f32(v219, v215, v218); + float32x2_t v234 = vfma_f32(v232, v228, v231); + float32x2_t v247 = vfma_f32(v245, v241, v244); + float32x2_t v260 = vfma_f32(v258, v254, v257); + float32x2_t v273 = vfma_f32(v271, v267, v270); + float32x2_t v286 = vfma_f32(v284, v280, v283); + float32x2_t v299 = vfma_f32(v297, v293, v296); + float32x2_t v312 = vfma_f32(v310, v306, v309); + float32x2_t v325 = vfma_f32(v323, v319, v322); + float32x2_t v326 = vadd_f32(v208, v221); + float32x2_t v327 = vadd_f32(v234, v247); + float32x2_t v328 = vadd_f32(v260, v273); + float32x2_t v329 = vadd_f32(v286, v299); + float32x2_t v330 = vadd_f32(v312, v325); + float32x2_t v331 = vsub_f32(v208, v221); + float32x2_t v332 = vsub_f32(v234, v247); + float32x2_t v333 = vsub_f32(v260, v273); + float32x2_t v334 = vsub_f32(v286, v299); + float32x2_t v335 = vsub_f32(v312, v325); + float32x2_t v336 = vadd_f32(v326, v327); + float32x2_t v337 = vadd_f32(v328, v330); + float32x2_t v339 = vsub_f32(v332, v333); + float32x2_t v340 = 
vadd_f32(v331, v335); + float32x2_t v350 = vsub_f32(v327, v329); + float32x2_t v351 = vsub_f32(v326, v329); + float32x2_t v352 = vsub_f32(v327, v326); + float32x2_t v353 = vsub_f32(v330, v329); + float32x2_t v354 = vsub_f32(v328, v329); + float32x2_t v355 = vsub_f32(v330, v328); + float32x2_t v356 = vsub_f32(v327, v330); + float32x2_t v357 = vsub_f32(v326, v328); + float32x2_t v359 = vadd_f32(v332, v334); + float32x2_t v360 = vsub_f32(v331, v334); + float32x2_t v361 = vadd_f32(v331, v332); + float32x2_t v362 = vsub_f32(v334, v335); + float32x2_t v363 = vsub_f32(v333, v334); + float32x2_t v364 = vsub_f32(v333, v335); + float32x2_t v365 = vadd_f32(v332, v335); + float32x2_t v366 = vsub_f32(v331, v333); + float32x2_t v338 = vadd_f32(v329, v336); + float32x2_t v348 = vsub_f32(v339, v340); + float32x2_t v358 = vsub_f32(v337, v336); + float32x2_t v367 = vadd_f32(v339, v340); + float32x2_t v386 = vmul_f32(v350, v385); + float32x2_t v390 = vmul_f32(v351, v389); + float32x2_t v394 = vmul_f32(v352, v393); + float32x2_t v398 = vmul_f32(v353, v397); + float32x2_t v402 = vmul_f32(v354, v401); + float32x2_t v406 = vmul_f32(v355, v405); + float32x2_t v410 = vmul_f32(v356, v409); + float32x2_t v414 = vmul_f32(v357, v413); + float32x2_t v424 = vrev64_f32(v359); + float32x2_t v431 = vrev64_f32(v360); + float32x2_t v438 = vrev64_f32(v361); + float32x2_t v445 = vrev64_f32(v362); + float32x2_t v452 = vrev64_f32(v363); + float32x2_t v459 = vrev64_f32(v364); + float32x2_t v466 = vrev64_f32(v365); + float32x2_t v473 = vrev64_f32(v366); + float32x2_t v341 = vadd_f32(v338, v337); + float32x2_t v349 = vsub_f32(v348, v334); + float32x2_t v418 = vmul_f32(v358, v417); + float32x2_t v425 = vmul_f32(v424, v423); + float32x2_t v432 = vmul_f32(v431, v430); + float32x2_t v439 = vmul_f32(v438, v437); + float32x2_t v446 = vmul_f32(v445, v444); + float32x2_t v453 = vmul_f32(v452, v451); + float32x2_t v460 = vmul_f32(v459, v458); + float32x2_t v467 = vmul_f32(v466, v465); + float32x2_t v474 = vmul_f32(v473, v472); + float32x2_t v480 = vrev64_f32(v367); + float32x2_t v483 = vadd_f32(v386, v390); + float32x2_t v484 = vadd_f32(v390, v394); + float32x2_t v485 = vsub_f32(v386, v394); + float32x2_t v486 = vadd_f32(v398, v402); + float32x2_t v487 = vadd_f32(v402, v406); + float32x2_t v488 = vsub_f32(v398, v406); + float32x2_t v347 = vadd_f32(v346, v341); + float32x2_t v375 = vmul_f32(v341, v374); + float32x2_t v381 = vrev64_f32(v349); + float32x2_t v481 = vmul_f32(v480, v479); + float32x2_t v489 = vadd_f32(v414, v418); + float32x2_t v490 = vadd_f32(v410, v418); + float32x2_t v491 = vadd_f32(v432, v439); + float32x2_t v492 = vsub_f32(v425, v439); + float32x2_t v493 = vadd_f32(v453, v460); + float32x2_t v494 = vsub_f32(v446, v460); + float32x2_t v382 = vmul_f32(v381, v380); + float32x2_t v482 = vsub_f32(v347, v375); + float32x2_t v495 = vadd_f32(v474, v481); + float32x2_t v496 = vsub_f32(v467, v481); + float32x2_t v497 = vadd_f32(v487, v489); + float32x2_t v515 = vadd_f32(v491, v492); + int16x4_t v531 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v347, 15), (int32x2_t){0, 0})); + float32x2_t v498 = vadd_f32(v497, v482); + float32x2_t v499 = vsub_f32(v482, v484); + float32x2_t v501 = vadd_f32(v482, v488); + float32x2_t v503 = vsub_f32(v482, v485); + float32x2_t v505 = vadd_f32(v482, v483); + float32x2_t v507 = vadd_f32(v382, v493); + float32x2_t v509 = vsub_f32(v495, v491); + float32x2_t v511 = vadd_f32(v382, v496); + float32x2_t v513 = vsub_f32(v496, v492); + float32x2_t v516 = vadd_f32(v515, v493); + v6[0] = 
vget_lane_s32(vreinterpret_s32_s16(v531), 0); + float32x2_t v500 = vsub_f32(v499, v489); + float32x2_t v502 = vadd_f32(v501, v490); + float32x2_t v504 = vsub_f32(v503, v490); + float32x2_t v506 = vsub_f32(v505, v486); + float32x2_t v508 = vadd_f32(v507, v495); + float32x2_t v510 = vsub_f32(v509, v382); + float32x2_t v512 = vadd_f32(v511, v494); + float32x2_t v514 = vsub_f32(v513, v382); + float32x2_t v517 = vadd_f32(v516, v494); + float32x2_t v518 = vsub_f32(v517, v382); + float32x2_t v520 = vadd_f32(v498, v508); + float32x2_t v521 = vadd_f32(v500, v510); + float32x2_t v522 = vsub_f32(v502, v512); + float32x2_t v523 = vadd_f32(v504, v514); + float32x2_t v524 = vsub_f32(v504, v514); + float32x2_t v525 = vadd_f32(v502, v512); + float32x2_t v526 = vsub_f32(v500, v510); + float32x2_t v527 = vsub_f32(v498, v508); + float32x2_t v519 = vadd_f32(v506, v518); + float32x2_t v528 = vsub_f32(v506, v518); + int16x4_t v543 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v520, 15), (int32x2_t){0, 0})); + int16x4_t v549 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v521, 15), (int32x2_t){0, 0})); + int16x4_t v555 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v522, 15), (int32x2_t){0, 0})); + int16x4_t v561 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v523, 15), (int32x2_t){0, 0})); + int16x4_t v567 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v524, 15), (int32x2_t){0, 0})); + int16x4_t v573 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v525, 15), (int32x2_t){0, 0})); + int16x4_t v579 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v526, 15), (int32x2_t){0, 0})); + int16x4_t v585 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v527, 15), (int32x2_t){0, 0})); + int16x4_t v537 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v519, 15), (int32x2_t){0, 0})); + v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v543), 0); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v549), 0); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v555), 0); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v561), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v567), 0); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v573), 0); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v579), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v585), 0); + int16x4_t v591 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v528, 15), (int32x2_t){0, 0})); + v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v537), 0); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v591), 0); + v5 += 1 * idist; + v6 += 1 * odist; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cs16_ab_t_gs11(const armral_cmplx_f32_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, + const armral_cmplx_f32_t *restrict w, + int howmany, int idist, int odist, + float dir) { + int64_t v0 = istride; + int64_t v1 = idist; + int64_t v2 = ostride; + int64_t v3 = odist; + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + int32_t *v6 = (int32_t *)y; + const float32x2_t *v7 = (const float32x2_t *)w; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * v1; + int64_t v12 = v10 * v3; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + int64_t v13 = j; + float v278 = 1.1000000000000001e+00F; + float v283 = -3.3166247903554003e-01F; + float v290 = 5.1541501300188641e-01F; + float v295 = 9.4125353283118118e-01F; + float v300 = 1.4143537075597825e+00F; + float v305 = 8.5949297361449750e-01F; + float v310 = 4.2314838273285138e-02F; + float v315 = 
3.8639279888589606e-01F; + float v320 = 5.1254589567200015e-01F; + float v325 = 1.0702757469471715e+00F; + float v330 = 5.5486073394528512e-01F; + float v335 = -1.2412944743900585e+00F; + float v342 = -2.0897833842005756e-01F; + float v349 = -3.7415717312460811e-01F; + float v356 = -4.9929922194110327e-02F; + float v363 = -6.5815896284539266e-01F; + float v370 = -6.3306543373877577e-01F; + float v377 = -1.0822460581641109e+00F; + float v384 = -8.1720737907134022e-01F; + float v391 = -4.2408709531871824e-01F; + const float32x2_t *v538 = &v5[v0]; + int32_t *v752 = &v6[v2]; + int64_t v33 = v0 * 10; + int64_t v47 = v0 * 2; + int64_t v61 = v0 * 9; + int64_t v75 = v0 * 3; + int64_t v89 = v0 * 8; + int64_t v103 = v0 * 4; + int64_t v117 = v0 * 7; + int64_t v131 = v0 * 5; + int64_t v145 = v0 * 6; + int64_t v167 = v10 * 9; + int64_t v181 = v10 * 8; + int64_t v188 = v10 * 2; + int64_t v195 = v10 * 7; + int64_t v202 = v10 * 3; + int64_t v209 = v10 * 6; + int64_t v216 = v10 * 4; + int64_t v223 = v10 * 5; + int64_t v224 = v13 * 10; + float v286 = v4 * v283; + float v338 = v4 * v335; + float v345 = v4 * v342; + float v352 = v4 * v349; + float v359 = v4 * v356; + float v366 = v4 * v363; + float v373 = v4 * v370; + float v380 = v4 * v377; + float v387 = v4 * v384; + float v394 = v4 * v391; + int64_t v453 = v2 * 10; + int64_t v461 = v2 * 9; + int64_t v469 = v2 * 8; + int64_t v477 = v2 * 7; + int64_t v485 = v2 * 6; + int64_t v493 = v2 * 5; + int64_t v501 = v2 * 4; + int64_t v509 = v2 * 3; + int64_t v517 = v2 * 2; + const float32x2_t *v631 = &v5[0]; + svint64_t v632 = svindex_s64(0, v1); + svfloat32_t v635 = svdup_n_f32(v278); + svfloat32_t v637 = svdup_n_f32(v290); + svfloat32_t v638 = svdup_n_f32(v295); + svfloat32_t v639 = svdup_n_f32(v300); + svfloat32_t v640 = svdup_n_f32(v305); + svfloat32_t v641 = svdup_n_f32(v310); + svfloat32_t v642 = svdup_n_f32(v315); + svfloat32_t v643 = svdup_n_f32(v320); + svfloat32_t v644 = svdup_n_f32(v325); + svfloat32_t v645 = svdup_n_f32(v330); + int32_t *v662 = &v6[0]; + svint64_t v753 = svindex_s64(0, v3); + svfloat32_t v163 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v224])); + int64_t v169 = v167 + v224; + int64_t v176 = v10 + v224; + int64_t v183 = v181 + v224; + int64_t v190 = v188 + v224; + int64_t v197 = v195 + v224; + int64_t v204 = v202 + v224; + int64_t v211 = v209 + v224; + int64_t v218 = v216 + v224; + int64_t v225 = v223 + v224; + svfloat32_t v540 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v538), v632)); + const float32x2_t *v548 = &v5[v33]; + const float32x2_t *v557 = &v5[v47]; + const float32x2_t *v566 = &v5[v61]; + const float32x2_t *v575 = &v5[v75]; + const float32x2_t *v584 = &v5[v89]; + const float32x2_t *v593 = &v5[v103]; + const float32x2_t *v602 = &v5[v117]; + const float32x2_t *v611 = &v5[v131]; + const float32x2_t *v620 = &v5[v145]; + svfloat32_t v633 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v631), v632)); + svfloat32_t v636 = svdup_n_f32(v286); + svfloat32_t v646 = svdup_n_f32(v338); + svfloat32_t v647 = svdup_n_f32(v345); + svfloat32_t v648 = svdup_n_f32(v352); + svfloat32_t v649 = svdup_n_f32(v359); + svfloat32_t v650 = svdup_n_f32(v366); + svfloat32_t v651 = svdup_n_f32(v373); + svfloat32_t v652 = svdup_n_f32(v380); + svfloat32_t v653 = svdup_n_f32(v387); + svfloat32_t v654 = svdup_n_f32(v394); + int32_t *v671 = &v6[v453]; + int32_t *v680 = &v6[v461]; + int32_t *v689 = &v6[v469]; + int32_t *v698 = &v6[v477]; + int32_t *v707 = &v6[v485]; + 
int32_t *v716 = &v6[v493]; + int32_t *v725 = &v6[v501]; + int32_t *v734 = &v6[v509]; + int32_t *v743 = &v6[v517]; + svfloat32_t zero164 = svdup_n_f32(0); + svfloat32_t v164 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero164, v540, v163, 0), + v540, v163, 90); + svfloat32_t v170 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v169])); + svfloat32_t v177 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v176])); + svfloat32_t v184 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v183])); + svfloat32_t v191 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v190])); + svfloat32_t v198 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v197])); + svfloat32_t v205 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v204])); + svfloat32_t v212 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v211])); + svfloat32_t v219 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v218])); + svfloat32_t v226 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v225])); + svfloat32_t v550 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v548), v632)); + svfloat32_t v559 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v557), v632)); + svfloat32_t v568 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v566), v632)); + svfloat32_t v577 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v575), v632)); + svfloat32_t v586 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v584), v632)); + svfloat32_t v595 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v593), v632)); + svfloat32_t v604 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v602), v632)); + svfloat32_t v613 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v611), v632)); + svfloat32_t v622 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v620), v632)); + svfloat32_t zero171 = svdup_n_f32(0); + svfloat32_t v171 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero171, v550, v170, 0), + v550, v170, 90); + svfloat32_t zero178 = svdup_n_f32(0); + svfloat32_t v178 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero178, v559, v177, 0), + v559, v177, 90); + svfloat32_t zero185 = svdup_n_f32(0); + svfloat32_t v185 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero185, v568, v184, 0), + v568, v184, 90); + svfloat32_t zero192 = svdup_n_f32(0); + svfloat32_t v192 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero192, v577, v191, 0), + v577, v191, 90); + svfloat32_t zero199 = svdup_n_f32(0); + svfloat32_t v199 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero199, v586, v198, 0), + v586, v198, 90); + svfloat32_t zero206 = svdup_n_f32(0); + svfloat32_t v206 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero206, v595, v205, 0), + v595, v205, 90); + svfloat32_t zero213 = svdup_n_f32(0); + svfloat32_t v213 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero213, v604, v212, 0), + v604, v212, 90); + svfloat32_t zero220 = svdup_n_f32(0); + svfloat32_t v220 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero220, v613, v219, 0), + v613, v219, 90); + svfloat32_t zero227 = svdup_n_f32(0); + svfloat32_t v227 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero227, v622, 
v226, 0), + v622, v226, 90); + svfloat32_t v228 = svadd_f32_x(svptrue_b32(), v164, v171); + svfloat32_t v229 = svadd_f32_x(svptrue_b32(), v178, v185); + svfloat32_t v230 = svadd_f32_x(svptrue_b32(), v192, v199); + svfloat32_t v231 = svadd_f32_x(svptrue_b32(), v206, v213); + svfloat32_t v232 = svadd_f32_x(svptrue_b32(), v220, v227); + svfloat32_t v233 = svsub_f32_x(svptrue_b32(), v164, v171); + svfloat32_t v234 = svsub_f32_x(svptrue_b32(), v178, v185); + svfloat32_t v235 = svsub_f32_x(svptrue_b32(), v192, v199); + svfloat32_t v236 = svsub_f32_x(svptrue_b32(), v206, v213); + svfloat32_t v237 = svsub_f32_x(svptrue_b32(), v220, v227); + svfloat32_t v238 = svadd_f32_x(svptrue_b32(), v228, v229); + svfloat32_t v239 = svadd_f32_x(svptrue_b32(), v230, v232); + svfloat32_t v241 = svsub_f32_x(svptrue_b32(), v234, v235); + svfloat32_t v242 = svadd_f32_x(svptrue_b32(), v233, v237); + svfloat32_t v254 = svsub_f32_x(svptrue_b32(), v229, v231); + svfloat32_t v255 = svsub_f32_x(svptrue_b32(), v228, v231); + svfloat32_t v256 = svsub_f32_x(svptrue_b32(), v229, v228); + svfloat32_t v257 = svsub_f32_x(svptrue_b32(), v232, v231); + svfloat32_t v258 = svsub_f32_x(svptrue_b32(), v230, v231); + svfloat32_t v259 = svsub_f32_x(svptrue_b32(), v232, v230); + svfloat32_t v260 = svsub_f32_x(svptrue_b32(), v229, v232); + svfloat32_t v261 = svsub_f32_x(svptrue_b32(), v228, v230); + svfloat32_t v263 = svadd_f32_x(svptrue_b32(), v234, v236); + svfloat32_t v264 = svsub_f32_x(svptrue_b32(), v233, v236); + svfloat32_t v265 = svadd_f32_x(svptrue_b32(), v233, v234); + svfloat32_t v266 = svsub_f32_x(svptrue_b32(), v236, v237); + svfloat32_t v267 = svsub_f32_x(svptrue_b32(), v235, v236); + svfloat32_t v268 = svsub_f32_x(svptrue_b32(), v235, v237); + svfloat32_t v269 = svadd_f32_x(svptrue_b32(), v234, v237); + svfloat32_t v270 = svsub_f32_x(svptrue_b32(), v233, v235); + svfloat32_t v240 = svadd_f32_x(svptrue_b32(), v231, v238); + svfloat32_t v252 = svsub_f32_x(svptrue_b32(), v241, v242); + svfloat32_t v262 = svsub_f32_x(svptrue_b32(), v239, v238); + svfloat32_t v271 = svadd_f32_x(svptrue_b32(), v241, v242); + svfloat32_t v298 = svmul_f32_x(svptrue_b32(), v255, v638); + svfloat32_t v303 = svmul_f32_x(svptrue_b32(), v256, v639); + svfloat32_t v313 = svmul_f32_x(svptrue_b32(), v258, v641); + svfloat32_t v318 = svmul_f32_x(svptrue_b32(), v259, v642); + svfloat32_t zero340 = svdup_n_f32(0); + svfloat32_t v340 = svcmla_f32_x(pred_full, zero340, v646, v263, 90); + svfloat32_t zero354 = svdup_n_f32(0); + svfloat32_t v354 = svcmla_f32_x(pred_full, zero354, v648, v265, 90); + svfloat32_t zero361 = svdup_n_f32(0); + svfloat32_t v361 = svcmla_f32_x(pred_full, zero361, v649, v266, 90); + svfloat32_t zero375 = svdup_n_f32(0); + svfloat32_t v375 = svcmla_f32_x(pred_full, zero375, v651, v268, 90); + svfloat32_t zero382 = svdup_n_f32(0); + svfloat32_t v382 = svcmla_f32_x(pred_full, zero382, v652, v269, 90); + svfloat32_t v243 = svadd_f32_x(svptrue_b32(), v240, v239); + svfloat32_t v253 = svsub_f32_x(svptrue_b32(), v252, v236); + svfloat32_t v333 = svmul_f32_x(svptrue_b32(), v262, v645); + svfloat32_t zero396 = svdup_n_f32(0); + svfloat32_t v396 = svcmla_f32_x(pred_full, zero396, v654, v271, 90); + svfloat32_t v398 = svmla_f32_x(pred_full, v298, v254, v637); + svfloat32_t v399 = svmla_f32_x(pred_full, v303, v255, v638); + svfloat32_t v400 = svnmls_f32_x(pred_full, v303, v254, v637); + svfloat32_t v401 = svmla_f32_x(pred_full, v313, v257, v640); + svfloat32_t v402 = svmla_f32_x(pred_full, v318, v258, v641); + svfloat32_t v403 = 
svnmls_f32_x(pred_full, v318, v257, v640); + svfloat32_t v406 = svcmla_f32_x(pred_full, v354, v647, v264, 90); + svfloat32_t v407 = svsub_f32_x(svptrue_b32(), v340, v354); + svfloat32_t v408 = svcmla_f32_x(pred_full, v375, v650, v267, 90); + svfloat32_t v409 = svsub_f32_x(svptrue_b32(), v361, v375); + svfloat32_t v251 = svadd_f32_x(svptrue_b32(), v633, v243); + svfloat32_t zero288 = svdup_n_f32(0); + svfloat32_t v288 = svcmla_f32_x(pred_full, zero288, v636, v253, 90); + svfloat32_t v404 = svmla_f32_x(pred_full, v333, v261, v644); + svfloat32_t v405 = svmla_f32_x(pred_full, v333, v260, v643); + svfloat32_t v410 = svcmla_f32_x(pred_full, v396, v653, v270, 90); + svfloat32_t v411 = svsub_f32_x(svptrue_b32(), v382, v396); + svfloat32_t v430 = svadd_f32_x(svptrue_b32(), v406, v407); + svfloat32_t v397 = svmls_f32_x(pred_full, v251, v243, v635); + svfloat32_t v412 = svadd_f32_x(svptrue_b32(), v402, v404); + svfloat32_t v422 = svadd_f32_x(svptrue_b32(), v288, v408); + svfloat32_t v424 = svsub_f32_x(svptrue_b32(), v410, v406); + svfloat32_t v426 = svadd_f32_x(svptrue_b32(), v288, v411); + svfloat32_t v428 = svsub_f32_x(svptrue_b32(), v411, v407); + svfloat32_t v431 = svadd_f32_x(svptrue_b32(), v430, v408); + svint16_t v446 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v251, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v413 = svadd_f32_x(svptrue_b32(), v412, v397); + svfloat32_t v414 = svsub_f32_x(svptrue_b32(), v397, v399); + svfloat32_t v416 = svadd_f32_x(svptrue_b32(), v397, v403); + svfloat32_t v418 = svsub_f32_x(svptrue_b32(), v397, v400); + svfloat32_t v420 = svadd_f32_x(svptrue_b32(), v397, v398); + svfloat32_t v423 = svadd_f32_x(svptrue_b32(), v422, v410); + svfloat32_t v425 = svsub_f32_x(svptrue_b32(), v424, v288); + svfloat32_t v427 = svadd_f32_x(svptrue_b32(), v426, v409); + svfloat32_t v429 = svsub_f32_x(svptrue_b32(), v428, v288); + svfloat32_t v432 = svadd_f32_x(svptrue_b32(), v431, v409); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v662), v753, + svreinterpret_u64_s16(v446)); + svfloat32_t v415 = svsub_f32_x(svptrue_b32(), v414, v404); + svfloat32_t v417 = svadd_f32_x(svptrue_b32(), v416, v405); + svfloat32_t v419 = svsub_f32_x(svptrue_b32(), v418, v405); + svfloat32_t v421 = svsub_f32_x(svptrue_b32(), v420, v401); + svfloat32_t v433 = svsub_f32_x(svptrue_b32(), v432, v288); + svfloat32_t v435 = svadd_f32_x(svptrue_b32(), v413, v423); + svfloat32_t v442 = svsub_f32_x(svptrue_b32(), v413, v423); + svfloat32_t v434 = svadd_f32_x(svptrue_b32(), v421, v433); + svfloat32_t v436 = svadd_f32_x(svptrue_b32(), v415, v425); + svfloat32_t v437 = svsub_f32_x(svptrue_b32(), v417, v427); + svfloat32_t v438 = svadd_f32_x(svptrue_b32(), v419, v429); + svfloat32_t v439 = svsub_f32_x(svptrue_b32(), v419, v429); + svfloat32_t v440 = svadd_f32_x(svptrue_b32(), v417, v427); + svfloat32_t v441 = svsub_f32_x(svptrue_b32(), v415, v425); + svfloat32_t v443 = svsub_f32_x(svptrue_b32(), v421, v433); + svint16_t v462 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v435, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v518 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v442, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v454 = 
svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v434, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v470 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v436, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v478 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v437, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v486 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v438, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v494 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v439, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v502 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v440, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v510 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v441, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v526 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v443, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v680), v753, + svreinterpret_u64_s16(v462)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v743), v753, + svreinterpret_u64_s16(v518)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v671), v753, + svreinterpret_u64_s16(v454)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v689), v753, + svreinterpret_u64_s16(v470)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v698), v753, + svreinterpret_u64_s16(v478)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v707), v753, + svreinterpret_u64_s16(v486)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v716), v753, + svreinterpret_u64_s16(v494)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v725), v753, + svreinterpret_u64_s16(v502)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v734), v753, + svreinterpret_u64_s16(v510)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v752), v753, + svreinterpret_u64_s16(v526)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cs16_ab_t_gs12(const armral_cmplx_f32_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, + const armral_cmplx_f32_t *restrict w, + int howmany, int idist, int odist, + float dir) { + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + int32_t *v6 = (int32_t *)y; + const float32x2_t *v7 = (const float32x2_t *)w; + for (int j = 0; j < howmany; j += 1) { + float32x2_t v242 = v5[istride]; + float v353 = 1.0000000000000000e+00F; + float v354 = -1.0000000000000000e+00F; + float v380 = -1.4999999999999998e+00F; + float v381 = 1.4999999999999998e+00F; + float v409 = 8.6602540378443871e-01F; + float32x2_t 
v412 = (float32x2_t){v4, v4}; + float v417 = -8.6602540378443871e-01F; + float32x2_t v279 = vtrn1_f32(v242, v242); + float32x2_t v280 = vtrn2_f32(v242, v242); + float32x2_t v324 = v5[0]; + float32x2_t v355 = (float32x2_t){v353, v354}; + float32x2_t v378 = (float32x2_t){v380, v380}; + float32x2_t v382 = (float32x2_t){v380, v381}; + float32x2_t v411 = (float32x2_t){v409, v417}; + float32x2_t v418 = (float32x2_t){v417, v417}; + float32x2_t v20 = v5[istride * 4]; + float32x2_t v38 = v5[istride * 8]; + int64_t v55 = 6 + j * 22; + int64_t v68 = 14 + j * 22; + float32x2_t v82 = v5[istride * 7]; + float32x2_t v100 = v5[istride * 11]; + int64_t v117 = 12 + j * 22; + int64_t v130 = 20 + j * 22; + float32x2_t v144 = v5[istride * 3]; + int64_t v148 = 4 + j * 22; + float32x2_t v162 = v5[istride * 10]; + float32x2_t v180 = v5[istride * 2]; + int64_t v197 = 18 + j * 22; + int64_t v210 = 2 + j * 22; + float32x2_t v224 = v5[istride * 6]; + int64_t v228 = 10 + j * 22; + float32x2_t v260 = v5[istride * 5]; + float32x2_t v278 = v7[j * 22]; + int64_t v282 = j * 22 + 1; + int64_t v290 = 8 + j * 22; + float32x2_t v304 = v5[istride * 9]; + int64_t v308 = 16 + j * 22; + float32x2_t v357 = vmul_f32(v412, v355); + float32x2_t v384 = vmul_f32(v412, v382); + float32x2_t v413 = vmul_f32(v412, v411); + float32x2_t v56 = v7[v55]; + float32x2_t v57 = vtrn1_f32(v20, v20); + float32x2_t v58 = vtrn2_f32(v20, v20); + int64_t v60 = v55 + 1; + float32x2_t v69 = v7[v68]; + float32x2_t v70 = vtrn1_f32(v38, v38); + float32x2_t v71 = vtrn2_f32(v38, v38); + int64_t v73 = v68 + 1; + float32x2_t v118 = v7[v117]; + float32x2_t v119 = vtrn1_f32(v82, v82); + float32x2_t v120 = vtrn2_f32(v82, v82); + int64_t v122 = v117 + 1; + float32x2_t v131 = v7[v130]; + float32x2_t v132 = vtrn1_f32(v100, v100); + float32x2_t v133 = vtrn2_f32(v100, v100); + int64_t v135 = v130 + 1; + float32x2_t v149 = v7[v148]; + float32x2_t v150 = vtrn1_f32(v144, v144); + float32x2_t v151 = vtrn2_f32(v144, v144); + int64_t v153 = v148 + 1; + float32x2_t v198 = v7[v197]; + float32x2_t v199 = vtrn1_f32(v162, v162); + float32x2_t v200 = vtrn2_f32(v162, v162); + int64_t v202 = v197 + 1; + float32x2_t v211 = v7[v210]; + float32x2_t v212 = vtrn1_f32(v180, v180); + float32x2_t v213 = vtrn2_f32(v180, v180); + int64_t v215 = v210 + 1; + float32x2_t v229 = v7[v228]; + float32x2_t v230 = vtrn1_f32(v224, v224); + float32x2_t v231 = vtrn2_f32(v224, v224); + int64_t v233 = v228 + 1; + float32x2_t v283 = v7[v282]; + float32x2_t v284 = vmul_f32(v279, v278); + float32x2_t v291 = v7[v290]; + float32x2_t v292 = vtrn1_f32(v260, v260); + float32x2_t v293 = vtrn2_f32(v260, v260); + int64_t v295 = v290 + 1; + float32x2_t v309 = v7[v308]; + float32x2_t v310 = vtrn1_f32(v304, v304); + float32x2_t v311 = vtrn2_f32(v304, v304); + int64_t v313 = v308 + 1; + float32x2_t v61 = v7[v60]; + float32x2_t v62 = vmul_f32(v57, v56); + float32x2_t v74 = v7[v73]; + float32x2_t v75 = vmul_f32(v70, v69); + float32x2_t v123 = v7[v122]; + float32x2_t v124 = vmul_f32(v119, v118); + float32x2_t v136 = v7[v135]; + float32x2_t v137 = vmul_f32(v132, v131); + float32x2_t v154 = v7[v153]; + float32x2_t v155 = vmul_f32(v150, v149); + float32x2_t v203 = v7[v202]; + float32x2_t v204 = vmul_f32(v199, v198); + float32x2_t v216 = v7[v215]; + float32x2_t v217 = vmul_f32(v212, v211); + float32x2_t v234 = v7[v233]; + float32x2_t v235 = vmul_f32(v230, v229); + float32x2_t v296 = v7[v295]; + float32x2_t v297 = vmul_f32(v292, v291); + float32x2_t v314 = v7[v313]; + float32x2_t v315 = vmul_f32(v310, v309); + float32x2_t v286 = 
vfma_f32(v284, v280, v283); + float32x2_t v64 = vfma_f32(v62, v58, v61); + float32x2_t v77 = vfma_f32(v75, v71, v74); + float32x2_t v126 = vfma_f32(v124, v120, v123); + float32x2_t v139 = vfma_f32(v137, v133, v136); + float32x2_t v157 = vfma_f32(v155, v151, v154); + float32x2_t v206 = vfma_f32(v204, v200, v203); + float32x2_t v219 = vfma_f32(v217, v213, v216); + float32x2_t v237 = vfma_f32(v235, v231, v234); + float32x2_t v299 = vfma_f32(v297, v293, v296); + float32x2_t v317 = vfma_f32(v315, v311, v314); + float32x2_t v318 = vadd_f32(v64, v77); + float32x2_t v319 = vsub_f32(v64, v77); + float32x2_t v326 = vadd_f32(v126, v139); + float32x2_t v327 = vsub_f32(v126, v139); + float32x2_t v329 = vadd_f32(v206, v219); + float32x2_t v330 = vsub_f32(v206, v219); + float32x2_t v332 = vadd_f32(v286, v299); + float32x2_t v333 = vsub_f32(v286, v299); + float32x2_t v325 = vadd_f32(v318, v324); + float32x2_t v328 = vadd_f32(v326, v157); + float32x2_t v331 = vadd_f32(v329, v237); + float32x2_t v334 = vadd_f32(v332, v317); + float32x2_t v362 = vadd_f32(v318, v329); + float32x2_t v363 = vsub_f32(v318, v329); + float32x2_t v364 = vadd_f32(v326, v332); + float32x2_t v365 = vsub_f32(v326, v332); + float32x2_t v389 = vadd_f32(v319, v330); + float32x2_t v390 = vsub_f32(v319, v330); + float32x2_t v391 = vadd_f32(v327, v333); + float32x2_t v392 = vsub_f32(v327, v333); + float32x2_t v335 = vadd_f32(v325, v331); + float32x2_t v336 = vsub_f32(v325, v331); + float32x2_t v337 = vadd_f32(v328, v334); + float32x2_t v338 = vsub_f32(v328, v334); + float32x2_t v366 = vadd_f32(v362, v364); + float32x2_t v367 = vsub_f32(v362, v364); + float32x2_t v379 = vmul_f32(v363, v378); + float32x2_t v385 = vrev64_f32(v365); + float32x2_t v393 = vadd_f32(v389, v391); + float32x2_t v394 = vsub_f32(v389, v391); + float32x2_t v414 = vrev64_f32(v390); + float32x2_t v419 = vmul_f32(v392, v418); + float32x2_t v339 = vadd_f32(v335, v337); + float32x2_t v340 = vsub_f32(v335, v337); + float32x2_t v358 = vrev64_f32(v338); + float32x2_t v371 = vmul_f32(v366, v378); + float32x2_t v375 = vmul_f32(v367, v378); + float32x2_t v386 = vmul_f32(v385, v384); + float32x2_t v400 = vrev64_f32(v393); + float32x2_t v407 = vrev64_f32(v394); + float32x2_t v415 = vmul_f32(v414, v413); + float32x2_t v359 = vmul_f32(v358, v357); + float32x2_t v387 = vadd_f32(v379, v386); + float32x2_t v388 = vsub_f32(v379, v386); + float32x2_t v401 = vmul_f32(v400, v413); + float32x2_t v408 = vmul_f32(v407, v413); + float32x2_t v420 = vadd_f32(v415, v419); + float32x2_t v421 = vsub_f32(v415, v419); + float32x2_t v422 = vadd_f32(v339, v371); + int16x4_t v427 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v339, 15), (int32x2_t){0, 0})); + float32x2_t v464 = vadd_f32(v340, v375); + int16x4_t v469 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v340, 15), (int32x2_t){0, 0})); + float32x2_t v360 = vadd_f32(v336, v359); + float32x2_t v361 = vsub_f32(v336, v359); + float32x2_t v423 = vadd_f32(v422, v401); + float32x2_t v424 = vsub_f32(v422, v401); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v427), 0); + float32x2_t v465 = vadd_f32(v464, v408); + float32x2_t v466 = vsub_f32(v464, v408); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v469), 0); + int16x4_t v433 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v424, 15), (int32x2_t){0, 0})); + int16x4_t v439 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v423, 15), (int32x2_t){0, 0})); + float32x2_t v443 = vadd_f32(v361, v388); + int16x4_t v448 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v361, 15), (int32x2_t){0, 0})); + int16x4_t v475 = + 
vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v466, 15), (int32x2_t){0, 0})); + int16x4_t v481 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v465, 15), (int32x2_t){0, 0})); + float32x2_t v485 = vadd_f32(v360, v387); + int16x4_t v490 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v360, 15), (int32x2_t){0, 0})); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v433), 0); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v439), 0); + float32x2_t v444 = vadd_f32(v443, v421); + float32x2_t v445 = vsub_f32(v443, v421); + v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v448), 0); + v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v475), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v481), 0); + float32x2_t v486 = vadd_f32(v485, v420); + float32x2_t v487 = vsub_f32(v485, v420); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v490), 0); + int16x4_t v454 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v445, 15), (int32x2_t){0, 0})); + int16x4_t v460 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v444, 15), (int32x2_t){0, 0})); + int16x4_t v496 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v487, 15), (int32x2_t){0, 0})); + int16x4_t v502 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v486, 15), (int32x2_t){0, 0})); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v454), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v460), 0); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v496), 0); + v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v502), 0); + v5 += 1 * idist; + v6 += 1 * odist; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cs16_ab_t_gs12(const armral_cmplx_f32_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, + const armral_cmplx_f32_t *restrict w, + int howmany, int idist, int odist, + float dir) { + int64_t v0 = istride; + int64_t v1 = idist; + int64_t v2 = ostride; + int64_t v3 = odist; + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + int32_t *v6 = (int32_t *)y; + const float32x2_t *v7 = (const float32x2_t *)w; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * v1; + int64_t v12 = v10 * v3; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + int64_t v13 = j; + float v269 = -1.0000000000000000e+00F; + float v294 = -1.4999999999999998e+00F; + float v299 = 1.4999999999999998e+00F; + float v335 = -8.6602540378443871e-01F; + const float32x2_t *v527 = &v5[v0]; + int32_t *v615 = &v6[v2]; + int64_t v19 = v0 * 4; + int64_t v33 = v0 * 8; + int64_t v48 = v10 * 3; + int64_t v55 = v10 * 7; + int64_t v61 = v0 * 7; + int64_t v75 = v0 * 11; + int64_t v90 = v10 * 6; + int64_t v97 = v10 * 10; + int64_t v103 = v0 * 3; + int64_t v111 = v10 * 2; + int64_t v117 = v0 * 10; + int64_t v131 = v0 * 2; + int64_t v146 = v10 * 9; + int64_t v159 = v0 * 6; + int64_t v167 = v10 * 5; + int64_t v187 = v0 * 5; + int64_t v209 = v10 * 4; + int64_t v215 = v0 * 9; + int64_t v223 = v10 * 8; + int64_t v224 = v13 * 11; + float v272 = v4 * v269; + float v302 = v4 * v299; + float v331 = v4 * v335; + int64_t v353 = v2 * 4; + int64_t v361 = v2 * 8; + int64_t v372 = v2 * 9; + int64_t v388 = v2 * 5; + int64_t v399 = v2 * 6; + int64_t v407 = v2 * 10; + int64_t v415 = v2 * 2; + int64_t v426 = v2 * 3; + int64_t v434 = v2 * 7; + int64_t v442 = v2 * 11; + const float32x2_t *v557 = &v5[0]; + svint64_t v558 = svindex_s64(0, v1); + svfloat32_t v566 = svdup_n_f32(v294); + svfloat32_t v571 = svdup_n_f32(v335); + int32_t *v579 = &v6[0]; + svint64_t v679 = svindex_s64(0, v3); + 
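+ // SVE batch addressing (descriptive note): each 64-bit lane holds one cf32 value from an independent transform; the gather/scatter index vectors v558/v679 step by idist/odist across transforms, and v224 marks the start of this batch's twiddle block.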
int64_t v50 = v48 + v224; + int64_t v57 = v55 + v224; + int64_t v92 = v90 + v224; + int64_t v99 = v97 + v224; + int64_t v113 = v111 + v224; + int64_t v148 = v146 + v224; + int64_t v155 = v10 + v224; + int64_t v169 = v167 + v224; + svfloat32_t v205 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v224])); + int64_t v211 = v209 + v224; + int64_t v225 = v223 + v224; + const float32x2_t *v455 = &v5[v19]; + const float32x2_t *v464 = &v5[v33]; + const float32x2_t *v473 = &v5[v61]; + const float32x2_t *v482 = &v5[v75]; + const float32x2_t *v491 = &v5[v103]; + const float32x2_t *v500 = &v5[v117]; + const float32x2_t *v509 = &v5[v131]; + const float32x2_t *v518 = &v5[v159]; + svfloat32_t v529 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v527), v558)); + const float32x2_t *v537 = &v5[v187]; + const float32x2_t *v547 = &v5[v215]; + svfloat32_t v559 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v557), v558)); + svfloat32_t v563 = svdup_n_f32(v272); + svfloat32_t v567 = svdup_n_f32(v302); + svfloat32_t v570 = svdup_n_f32(v331); + int32_t *v588 = &v6[v353]; + int32_t *v597 = &v6[v361]; + int32_t *v606 = &v6[v372]; + int32_t *v624 = &v6[v388]; + int32_t *v633 = &v6[v399]; + int32_t *v642 = &v6[v407]; + int32_t *v651 = &v6[v415]; + int32_t *v660 = &v6[v426]; + int32_t *v669 = &v6[v434]; + int32_t *v678 = &v6[v442]; + svfloat32_t v51 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v50])); + svfloat32_t v58 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v57])); + svfloat32_t v93 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v92])); + svfloat32_t v100 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v99])); + svfloat32_t v114 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v113])); + svfloat32_t v149 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v148])); + svfloat32_t v156 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v155])); + svfloat32_t v170 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v169])); + svfloat32_t zero206 = svdup_n_f32(0); + svfloat32_t v206 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero206, v529, v205, 0), + v529, v205, 90); + svfloat32_t v212 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v211])); + svfloat32_t v226 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v225])); + svfloat32_t v457 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v455), v558)); + svfloat32_t v466 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v464), v558)); + svfloat32_t v475 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v473), v558)); + svfloat32_t v484 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v482), v558)); + svfloat32_t v493 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v491), v558)); + svfloat32_t v502 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v500), v558)); + svfloat32_t v511 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v509), v558)); + svfloat32_t v520 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v518), v558)); + svfloat32_t v539 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double 
*)(v537), v558)); + svfloat32_t v549 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v547), v558)); + svfloat32_t zero52 = svdup_n_f32(0); + svfloat32_t v52 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero52, v457, v51, 0), + v457, v51, 90); + svfloat32_t zero59 = svdup_n_f32(0); + svfloat32_t v59 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero59, v466, v58, 0), + v466, v58, 90); + svfloat32_t zero94 = svdup_n_f32(0); + svfloat32_t v94 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero94, v475, v93, 0), + v475, v93, 90); + svfloat32_t zero101 = svdup_n_f32(0); + svfloat32_t v101 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero101, v484, v100, 0), + v484, v100, 90); + svfloat32_t zero150 = svdup_n_f32(0); + svfloat32_t v150 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero150, v502, v149, 0), + v502, v149, 90); + svfloat32_t zero157 = svdup_n_f32(0); + svfloat32_t v157 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero157, v511, v156, 0), + v511, v156, 90); + svfloat32_t zero213 = svdup_n_f32(0); + svfloat32_t v213 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero213, v539, v212, 0), + v539, v212, 90); + svfloat32_t v228 = svadd_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v229 = svsub_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v238 = svadd_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v239 = svsub_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v241 = svadd_f32_x(svptrue_b32(), v150, v157); + svfloat32_t v242 = svsub_f32_x(svptrue_b32(), v150, v157); + svfloat32_t v244 = svadd_f32_x(svptrue_b32(), v206, v213); + svfloat32_t v245 = svsub_f32_x(svptrue_b32(), v206, v213); + svfloat32_t v237 = svadd_f32_x(svptrue_b32(), v228, v559); + svfloat32_t v240 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, v238, v493, v114, 0), + v493, v114, 90); + svfloat32_t v243 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, v241, v520, v170, 0), + v520, v170, 90); + svfloat32_t v246 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, v244, v549, v226, 0), + v549, v226, 90); + svfloat32_t v277 = svadd_f32_x(svptrue_b32(), v228, v241); + svfloat32_t v278 = svsub_f32_x(svptrue_b32(), v228, v241); + svfloat32_t v279 = svadd_f32_x(svptrue_b32(), v238, v244); + svfloat32_t v280 = svsub_f32_x(svptrue_b32(), v238, v244); + svfloat32_t v307 = svadd_f32_x(svptrue_b32(), v229, v242); + svfloat32_t v308 = svsub_f32_x(svptrue_b32(), v229, v242); + svfloat32_t v309 = svadd_f32_x(svptrue_b32(), v239, v245); + svfloat32_t v310 = svsub_f32_x(svptrue_b32(), v239, v245); + svfloat32_t v247 = svadd_f32_x(svptrue_b32(), v237, v243); + svfloat32_t v248 = svsub_f32_x(svptrue_b32(), v237, v243); + svfloat32_t v249 = svadd_f32_x(svptrue_b32(), v240, v246); + svfloat32_t v250 = svsub_f32_x(svptrue_b32(), v240, v246); + svfloat32_t v281 = svadd_f32_x(svptrue_b32(), v277, v279); + svfloat32_t v282 = svsub_f32_x(svptrue_b32(), v277, v279); + svfloat32_t zero304 = svdup_n_f32(0); + svfloat32_t v304 = svcmla_f32_x(pred_full, zero304, v567, v280, 90); + svfloat32_t v311 = svadd_f32_x(svptrue_b32(), v307, v309); + svfloat32_t v312 = svsub_f32_x(svptrue_b32(), v307, v309); + svfloat32_t zero333 = svdup_n_f32(0); + svfloat32_t v333 = svcmla_f32_x(pred_full, zero333, v570, v308, 90); + svfloat32_t v251 = svadd_f32_x(svptrue_b32(), v247, v249); + svfloat32_t v252 = svsub_f32_x(svptrue_b32(), v247, v249); + svfloat32_t zero274 = svdup_n_f32(0); + svfloat32_t v274 = svcmla_f32_x(pred_full, zero274, v563, v250, 90); + svfloat32_t v305 = 
svmla_f32_x(pred_full, v304, v278, v566); + svfloat32_t v306 = svnmls_f32_x(pred_full, v304, v278, v566); + svfloat32_t zero319 = svdup_n_f32(0); + svfloat32_t v319 = svcmla_f32_x(pred_full, zero319, v570, v311, 90); + svfloat32_t zero326 = svdup_n_f32(0); + svfloat32_t v326 = svcmla_f32_x(pred_full, zero326, v570, v312, 90); + svfloat32_t v339 = svmla_f32_x(pred_full, v333, v310, v571); + svfloat32_t v340 = svmls_f32_x(pred_full, v333, v310, v571); + svfloat32_t v275 = svadd_f32_x(svptrue_b32(), v248, v274); + svfloat32_t v276 = svsub_f32_x(svptrue_b32(), v248, v274); + svfloat32_t v341 = svmla_f32_x(pred_full, v251, v281, v566); + svint16_t v346 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v251, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v395 = svmla_f32_x(pred_full, v252, v282, v566); + svint16_t v400 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v252, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v342 = svadd_f32_x(svptrue_b32(), v341, v319); + svfloat32_t v343 = svsub_f32_x(svptrue_b32(), v341, v319); + svfloat32_t v368 = svadd_f32_x(svptrue_b32(), v276, v306); + svint16_t v373 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v276, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v396 = svadd_f32_x(svptrue_b32(), v395, v326); + svfloat32_t v397 = svsub_f32_x(svptrue_b32(), v395, v326); + svfloat32_t v422 = svadd_f32_x(svptrue_b32(), v275, v305); + svint16_t v427 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v275, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v579), v679, + svreinterpret_u64_s16(v346)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v633), v679, + svreinterpret_u64_s16(v400)); + svint16_t v354 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v343, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v362 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v342, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v369 = svadd_f32_x(svptrue_b32(), v368, v340); + svfloat32_t v370 = svsub_f32_x(svptrue_b32(), v368, v340); + svint16_t v408 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v397, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v416 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v396, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v423 = svadd_f32_x(svptrue_b32(), v422, v339); + svfloat32_t v424 = svsub_f32_x(svptrue_b32(), v422, v339); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v606), v679, + svreinterpret_u64_s16(v373)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v660), v679, + svreinterpret_u64_s16(v427)); + 
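+ // Q15 conversion of the remaining outputs: scale by 2^31, convert to int32, then svtbl keeps the upper half-words so each 64-bit lane packs a (re, im) cs16 pair that svst1w scatters as a single 32-bit store.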
svint16_t v381 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v370, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v389 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v369, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v435 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v424, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v443 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v423, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v588), v679, + svreinterpret_u64_s16(v354)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v597), v679, + svreinterpret_u64_s16(v362)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v642), v679, + svreinterpret_u64_s16(v408)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v651), v679, + svreinterpret_u64_s16(v416)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v615), v679, + svreinterpret_u64_s16(v381)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v624), v679, + svreinterpret_u64_s16(v389)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v669), v679, + svreinterpret_u64_s16(v435)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v678), v679, + svreinterpret_u64_s16(v443)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cs16_ab_t_gs13(const armral_cmplx_f32_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, + const armral_cmplx_f32_t *restrict w, + int howmany, int idist, int odist, + float dir) { + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + int32_t *v6 = (int32_t *)y; + const float32x2_t *v7 = (const float32x2_t *)w; + for (int j = 0; j < howmany; j += 1) { + float32x2_t v20 = v5[istride]; + float v441 = 1.0833333333333333e+00F; + float v445 = -3.0046260628866578e-01F; + float v448 = 7.4927933062613905e-01F; + float v449 = -7.4927933062613905e-01F; + float v455 = 4.0100212832186721e-01F; + float v456 = -4.0100212832186721e-01F; + float v462 = 5.7514072947400308e-01F; + float v463 = -5.7514072947400308e-01F; + float v470 = 5.2422663952658211e-01F; + float v474 = 5.1652078062348972e-01F; + float v478 = 7.7058589030924258e-03F; + float v482 = 4.2763404682656941e-01F; + float v486 = 1.5180597207438440e-01F; + float v490 = 5.7944001890096386e-01F; + float v493 = 1.1543953381323635e+00F; + float v494 = -1.1543953381323635e+00F; + float v500 = 9.0655220171271012e-01F; + float v501 = -9.0655220171271012e-01F; + float v507 = 8.1857027294591811e-01F; + float v508 = -8.1857027294591811e-01F; + float v514 = 1.1971367726043427e+00F; + float v515 = -1.1971367726043427e+00F; + float v521 = 8.6131170741789742e-01F; + float v522 = -8.6131170741789742e-01F; + float v528 = 1.1091548438375507e+00F; + float v529 = -1.1091548438375507e+00F; + float v535 = 4.2741434471979367e-02F; + float v536 = -4.2741434471979367e-02F; + float v542 = -4.5240494294812715e-02F; + float v543 = 4.5240494294812715e-02F; + float v549 = 2.9058457089163264e-01F; + float v550 = -2.9058457089163264e-01F; + float32x2_t v552 = 
(float32x2_t){v4, v4}; + float32x2_t v237 = vtrn1_f32(v20, v20); + float32x2_t v238 = vtrn2_f32(v20, v20); + float32x2_t v427 = v5[0]; + float32x2_t v442 = (float32x2_t){v441, v441}; + float32x2_t v446 = (float32x2_t){v445, v445}; + float32x2_t v450 = (float32x2_t){v448, v449}; + float32x2_t v457 = (float32x2_t){v455, v456}; + float32x2_t v464 = (float32x2_t){v462, v463}; + float32x2_t v471 = (float32x2_t){v470, v470}; + float32x2_t v475 = (float32x2_t){v474, v474}; + float32x2_t v479 = (float32x2_t){v478, v478}; + float32x2_t v483 = (float32x2_t){v482, v482}; + float32x2_t v487 = (float32x2_t){v486, v486}; + float32x2_t v491 = (float32x2_t){v490, v490}; + float32x2_t v495 = (float32x2_t){v493, v494}; + float32x2_t v502 = (float32x2_t){v500, v501}; + float32x2_t v509 = (float32x2_t){v507, v508}; + float32x2_t v516 = (float32x2_t){v514, v515}; + float32x2_t v523 = (float32x2_t){v521, v522}; + float32x2_t v530 = (float32x2_t){v528, v529}; + float32x2_t v537 = (float32x2_t){v535, v536}; + float32x2_t v544 = (float32x2_t){v542, v543}; + float32x2_t v551 = (float32x2_t){v549, v550}; + float32x2_t v38 = v5[istride * 12]; + float32x2_t v56 = v5[istride * 2]; + float32x2_t v74 = v5[istride * 11]; + float32x2_t v92 = v5[istride * 3]; + float32x2_t v110 = v5[istride * 10]; + float32x2_t v128 = v5[istride * 4]; + float32x2_t v146 = v5[istride * 9]; + float32x2_t v164 = v5[istride * 5]; + float32x2_t v182 = v5[istride * 8]; + float32x2_t v200 = v5[istride * 6]; + float32x2_t v218 = v5[istride * 7]; + float32x2_t v236 = v7[j * 24]; + int64_t v240 = j * 24 + 1; + int64_t v248 = 22 + j * 24; + int64_t v261 = 2 + j * 24; + int64_t v274 = 20 + j * 24; + int64_t v287 = 4 + j * 24; + int64_t v300 = 18 + j * 24; + int64_t v313 = 6 + j * 24; + int64_t v326 = 16 + j * 24; + int64_t v339 = 8 + j * 24; + int64_t v352 = 14 + j * 24; + int64_t v365 = 10 + j * 24; + int64_t v378 = 12 + j * 24; + float32x2_t v452 = vmul_f32(v552, v450); + float32x2_t v459 = vmul_f32(v552, v457); + float32x2_t v466 = vmul_f32(v552, v464); + float32x2_t v497 = vmul_f32(v552, v495); + float32x2_t v504 = vmul_f32(v552, v502); + float32x2_t v511 = vmul_f32(v552, v509); + float32x2_t v518 = vmul_f32(v552, v516); + float32x2_t v525 = vmul_f32(v552, v523); + float32x2_t v532 = vmul_f32(v552, v530); + float32x2_t v539 = vmul_f32(v552, v537); + float32x2_t v546 = vmul_f32(v552, v544); + float32x2_t v553 = vmul_f32(v552, v551); + float32x2_t v241 = v7[v240]; + float32x2_t v242 = vmul_f32(v237, v236); + float32x2_t v249 = v7[v248]; + float32x2_t v250 = vtrn1_f32(v38, v38); + float32x2_t v251 = vtrn2_f32(v38, v38); + int64_t v253 = v248 + 1; + float32x2_t v262 = v7[v261]; + float32x2_t v263 = vtrn1_f32(v56, v56); + float32x2_t v264 = vtrn2_f32(v56, v56); + int64_t v266 = v261 + 1; + float32x2_t v275 = v7[v274]; + float32x2_t v276 = vtrn1_f32(v74, v74); + float32x2_t v277 = vtrn2_f32(v74, v74); + int64_t v279 = v274 + 1; + float32x2_t v288 = v7[v287]; + float32x2_t v289 = vtrn1_f32(v92, v92); + float32x2_t v290 = vtrn2_f32(v92, v92); + int64_t v292 = v287 + 1; + float32x2_t v301 = v7[v300]; + float32x2_t v302 = vtrn1_f32(v110, v110); + float32x2_t v303 = vtrn2_f32(v110, v110); + int64_t v305 = v300 + 1; + float32x2_t v314 = v7[v313]; + float32x2_t v315 = vtrn1_f32(v128, v128); + float32x2_t v316 = vtrn2_f32(v128, v128); + int64_t v318 = v313 + 1; + float32x2_t v327 = v7[v326]; + float32x2_t v328 = vtrn1_f32(v146, v146); + float32x2_t v329 = vtrn2_f32(v146, v146); + int64_t v331 = v326 + 1; + float32x2_t v340 = v7[v339]; + float32x2_t v341 = 
vtrn1_f32(v164, v164); + float32x2_t v342 = vtrn2_f32(v164, v164); + int64_t v344 = v339 + 1; + float32x2_t v353 = v7[v352]; + float32x2_t v354 = vtrn1_f32(v182, v182); + float32x2_t v355 = vtrn2_f32(v182, v182); + int64_t v357 = v352 + 1; + float32x2_t v366 = v7[v365]; + float32x2_t v367 = vtrn1_f32(v200, v200); + float32x2_t v368 = vtrn2_f32(v200, v200); + int64_t v370 = v365 + 1; + float32x2_t v379 = v7[v378]; + float32x2_t v380 = vtrn1_f32(v218, v218); + float32x2_t v381 = vtrn2_f32(v218, v218); + int64_t v383 = v378 + 1; + float32x2_t v254 = v7[v253]; + float32x2_t v255 = vmul_f32(v250, v249); + float32x2_t v267 = v7[v266]; + float32x2_t v268 = vmul_f32(v263, v262); + float32x2_t v280 = v7[v279]; + float32x2_t v281 = vmul_f32(v276, v275); + float32x2_t v293 = v7[v292]; + float32x2_t v294 = vmul_f32(v289, v288); + float32x2_t v306 = v7[v305]; + float32x2_t v307 = vmul_f32(v302, v301); + float32x2_t v319 = v7[v318]; + float32x2_t v320 = vmul_f32(v315, v314); + float32x2_t v332 = v7[v331]; + float32x2_t v333 = vmul_f32(v328, v327); + float32x2_t v345 = v7[v344]; + float32x2_t v346 = vmul_f32(v341, v340); + float32x2_t v358 = v7[v357]; + float32x2_t v359 = vmul_f32(v354, v353); + float32x2_t v371 = v7[v370]; + float32x2_t v372 = vmul_f32(v367, v366); + float32x2_t v384 = v7[v383]; + float32x2_t v385 = vmul_f32(v380, v379); + float32x2_t v244 = vfma_f32(v242, v238, v241); + float32x2_t v257 = vfma_f32(v255, v251, v254); + float32x2_t v270 = vfma_f32(v268, v264, v267); + float32x2_t v283 = vfma_f32(v281, v277, v280); + float32x2_t v296 = vfma_f32(v294, v290, v293); + float32x2_t v309 = vfma_f32(v307, v303, v306); + float32x2_t v322 = vfma_f32(v320, v316, v319); + float32x2_t v335 = vfma_f32(v333, v329, v332); + float32x2_t v348 = vfma_f32(v346, v342, v345); + float32x2_t v361 = vfma_f32(v359, v355, v358); + float32x2_t v374 = vfma_f32(v372, v368, v371); + float32x2_t v387 = vfma_f32(v385, v381, v384); + float32x2_t v388 = vadd_f32(v244, v257); + float32x2_t v389 = vadd_f32(v270, v283); + float32x2_t v390 = vadd_f32(v296, v309); + float32x2_t v391 = vadd_f32(v322, v335); + float32x2_t v392 = vadd_f32(v348, v361); + float32x2_t v393 = vadd_f32(v374, v387); + float32x2_t v394 = vsub_f32(v244, v257); + float32x2_t v395 = vsub_f32(v270, v283); + float32x2_t v396 = vsub_f32(v296, v309); + float32x2_t v397 = vsub_f32(v322, v335); + float32x2_t v398 = vsub_f32(v348, v361); + float32x2_t v399 = vsub_f32(v374, v387); + float32x2_t v400 = vadd_f32(v389, v392); + float32x2_t v402 = vadd_f32(v388, v390); + float32x2_t v405 = vadd_f32(v395, v398); + float32x2_t v407 = vadd_f32(v394, v396); + float32x2_t v409 = vsub_f32(v389, v393); + float32x2_t v410 = vsub_f32(v390, v391); + float32x2_t v411 = vsub_f32(v388, v391); + float32x2_t v412 = vsub_f32(v392, v393); + float32x2_t v417 = vsub_f32(v395, v399); + float32x2_t v418 = vsub_f32(v394, v396); + float32x2_t v419 = vsub_f32(v395, v398); + float32x2_t v420 = vadd_f32(v394, v397); + float32x2_t v421 = vsub_f32(v398, v399); + float32x2_t v422 = vadd_f32(v396, v397); + float32x2_t v401 = vadd_f32(v400, v393); + float32x2_t v403 = vadd_f32(v402, v391); + float32x2_t v406 = vadd_f32(v405, v399); + float32x2_t v408 = vsub_f32(v407, v397); + float32x2_t v413 = vsub_f32(v409, v410); + float32x2_t v414 = vsub_f32(v411, v412); + float32x2_t v415 = vadd_f32(v409, v410); + float32x2_t v416 = vadd_f32(v411, v412); + float32x2_t v433 = vadd_f32(v417, v418); + float32x2_t v434 = vadd_f32(v419, v420); + float32x2_t v435 = vsub_f32(v421, v422); + float32x2_t v498 = 
vrev64_f32(v417); + float32x2_t v505 = vrev64_f32(v418); + float32x2_t v519 = vrev64_f32(v419); + float32x2_t v526 = vrev64_f32(v420); + float32x2_t v540 = vrev64_f32(v421); + float32x2_t v547 = vrev64_f32(v422); + float32x2_t v404 = vadd_f32(v401, v403); + float32x2_t v429 = vsub_f32(v403, v401); + float32x2_t v430 = vadd_f32(v406, v408); + float32x2_t v431 = vadd_f32(v413, v414); + float32x2_t v432 = vsub_f32(v415, v416); + float32x2_t v453 = vrev64_f32(v406); + float32x2_t v460 = vrev64_f32(v408); + float32x2_t v472 = vmul_f32(v413, v471); + float32x2_t v476 = vmul_f32(v414, v475); + float32x2_t v484 = vmul_f32(v415, v483); + float32x2_t v488 = vmul_f32(v416, v487); + float32x2_t v499 = vmul_f32(v498, v497); + float32x2_t v506 = vmul_f32(v505, v504); + float32x2_t v512 = vrev64_f32(v433); + float32x2_t v520 = vmul_f32(v519, v518); + float32x2_t v527 = vmul_f32(v526, v525); + float32x2_t v533 = vrev64_f32(v434); + float32x2_t v541 = vmul_f32(v540, v539); + float32x2_t v548 = vmul_f32(v547, v546); + float32x2_t v554 = vrev64_f32(v435); + float32x2_t v428 = vadd_f32(v427, v404); + float32x2_t v443 = vmul_f32(v404, v442); + float32x2_t v447 = vmul_f32(v429, v446); + float32x2_t v454 = vmul_f32(v453, v452); + float32x2_t v461 = vmul_f32(v460, v459); + float32x2_t v467 = vrev64_f32(v430); + float32x2_t v480 = vmul_f32(v431, v479); + float32x2_t v492 = vmul_f32(v432, v491); + float32x2_t v513 = vmul_f32(v512, v511); + float32x2_t v534 = vmul_f32(v533, v532); + float32x2_t v555 = vmul_f32(v554, v553); + float32x2_t v557 = vadd_f32(v476, v472); + float32x2_t v468 = vmul_f32(v467, v466); + float32x2_t v556 = vsub_f32(v428, v443); + float32x2_t v558 = vsub_f32(v557, v447); + float32x2_t v559 = vadd_f32(v476, v480); + float32x2_t v561 = vsub_f32(v480, v472); + float32x2_t v569 = vsub_f32(v499, v513); + float32x2_t v570 = vsub_f32(v506, v513); + float32x2_t v571 = vsub_f32(v520, v534); + float32x2_t v572 = vsub_f32(v527, v534); + float32x2_t v573 = vsub_f32(v541, v555); + float32x2_t v574 = vadd_f32(v548, v555); + int16x4_t v609 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v428, 15), (int32x2_t){0, 0})); + float32x2_t v560 = vadd_f32(v559, v447); + float32x2_t v562 = vsub_f32(v561, v447); + float32x2_t v563 = vadd_f32(v556, v484); + float32x2_t v565 = vsub_f32(v556, v488); + float32x2_t v567 = vsub_f32(v556, v484); + float32x2_t v575 = vsub_f32(v454, v468); + float32x2_t v576 = vsub_f32(v461, v468); + float32x2_t v587 = vadd_f32(v569, v573); + float32x2_t v589 = vadd_f32(v571, v573); + float32x2_t v591 = vsub_f32(v570, v574); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v609), 0); + float32x2_t v564 = vadd_f32(v563, v488); + float32x2_t v566 = vsub_f32(v565, v492); + float32x2_t v568 = vadd_f32(v567, v492); + float32x2_t v583 = vsub_f32(v576, v569); + float32x2_t v585 = vsub_f32(v574, v575); + float32x2_t v588 = vadd_f32(v587, v576); + float32x2_t v590 = vsub_f32(v589, v576); + float32x2_t v592 = vsub_f32(v591, v575); + float32x2_t v593 = vadd_f32(v575, v570); + float32x2_t v577 = vadd_f32(v558, v564); + float32x2_t v578 = vadd_f32(v560, v566); + float32x2_t v579 = vsub_f32(v566, v560); + float32x2_t v580 = vadd_f32(v562, v568); + float32x2_t v581 = vsub_f32(v564, v558); + float32x2_t v582 = vsub_f32(v568, v562); + float32x2_t v584 = vadd_f32(v583, v571); + float32x2_t v586 = vsub_f32(v585, v572); + float32x2_t v594 = vsub_f32(v593, v572); + float32x2_t v595 = vsub_f32(v577, v584); + float32x2_t v596 = vadd_f32(v578, v586); + float32x2_t v597 = vsub_f32(v579, v588); + float32x2_t v598 = vsub_f32(v580, 
v590); + float32x2_t v599 = vadd_f32(v581, v592); + float32x2_t v600 = vsub_f32(v582, v594); + float32x2_t v601 = vadd_f32(v582, v594); + float32x2_t v602 = vsub_f32(v581, v592); + float32x2_t v603 = vadd_f32(v580, v590); + float32x2_t v604 = vadd_f32(v579, v588); + float32x2_t v605 = vsub_f32(v578, v586); + float32x2_t v606 = vadd_f32(v577, v584); + int16x4_t v615 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v595, 15), (int32x2_t){0, 0})); + int16x4_t v621 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v596, 15), (int32x2_t){0, 0})); + int16x4_t v627 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v597, 15), (int32x2_t){0, 0})); + int16x4_t v633 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v598, 15), (int32x2_t){0, 0})); + int16x4_t v639 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v599, 15), (int32x2_t){0, 0})); + int16x4_t v645 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v600, 15), (int32x2_t){0, 0})); + int16x4_t v651 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v601, 15), (int32x2_t){0, 0})); + int16x4_t v657 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v602, 15), (int32x2_t){0, 0})); + int16x4_t v663 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v603, 15), (int32x2_t){0, 0})); + int16x4_t v669 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v604, 15), (int32x2_t){0, 0})); + int16x4_t v675 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v605, 15), (int32x2_t){0, 0})); + int16x4_t v681 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v606, 15), (int32x2_t){0, 0})); + v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v615), 0); + v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v621), 0); + v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v627), 0); + v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v633), 0); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v639), 0); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v645), 0); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v651), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v657), 0); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v663), 0); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v669), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v675), 0); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v681), 0); + v5 += 1 * idist; + v6 += 1 * odist; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cs16_ab_t_gs13(const armral_cmplx_f32_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, + const armral_cmplx_f32_t *restrict w, + int howmany, int idist, int odist, + float dir) { + int64_t v0 = istride; + int64_t v1 = idist; + int64_t v2 = ostride; + int64_t v3 = odist; + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + int32_t *v6 = (int32_t *)y; + const float32x2_t *v7 = (const float32x2_t *)w; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * v1; + int64_t v12 = v10 * v3; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + int64_t v13 = j; + float v326 = 1.0833333333333333e+00F; + float v331 = -3.0046260628866578e-01F; + float v336 = -7.4927933062613905e-01F; + float v343 = -4.0100212832186721e-01F; + float v350 = -5.7514072947400308e-01F; + float v357 = 5.2422663952658211e-01F; + float v362 = 5.1652078062348972e-01F; + float v367 = 7.7058589030924258e-03F; + float v372 = 4.2763404682656941e-01F; + float v377 = 1.5180597207438440e-01F; + float v382 = 5.7944001890096386e-01F; + float v387 = -1.1543953381323635e+00F; + float v394 = -9.0655220171271012e-01F; + float 
v401 = -8.1857027294591811e-01F; + float v408 = -1.1971367726043427e+00F; + float v415 = -8.6131170741789742e-01F; + float v422 = -1.1091548438375507e+00F; + float v429 = -4.2741434471979367e-02F; + float v436 = 4.5240494294812715e-02F; + float v443 = -2.9058457089163264e-01F; + const float32x2_t *v610 = &v5[v0]; + int32_t *v860 = &v6[v2]; + int64_t v33 = v0 * 12; + int64_t v47 = v0 * 2; + int64_t v61 = v0 * 11; + int64_t v75 = v0 * 3; + int64_t v89 = v0 * 10; + int64_t v103 = v0 * 4; + int64_t v117 = v0 * 9; + int64_t v131 = v0 * 5; + int64_t v145 = v0 * 8; + int64_t v159 = v0 * 6; + int64_t v173 = v0 * 7; + int64_t v195 = v10 * 11; + int64_t v209 = v10 * 10; + int64_t v216 = v10 * 2; + int64_t v223 = v10 * 9; + int64_t v230 = v10 * 3; + int64_t v237 = v10 * 8; + int64_t v244 = v10 * 4; + int64_t v251 = v10 * 7; + int64_t v258 = v10 * 5; + int64_t v265 = v10 * 6; + int64_t v266 = v13 * 12; + float v339 = v4 * v336; + float v346 = v4 * v343; + float v353 = v4 * v350; + float v390 = v4 * v387; + float v397 = v4 * v394; + float v404 = v4 * v401; + float v411 = v4 * v408; + float v418 = v4 * v415; + float v425 = v4 * v422; + float v432 = v4 * v429; + float v439 = v4 * v436; + float v446 = v4 * v443; + int64_t v509 = v2 * 12; + int64_t v517 = v2 * 11; + int64_t v525 = v2 * 10; + int64_t v533 = v2 * 9; + int64_t v541 = v2 * 8; + int64_t v549 = v2 * 7; + int64_t v557 = v2 * 6; + int64_t v565 = v2 * 5; + int64_t v573 = v2 * 4; + int64_t v581 = v2 * 3; + int64_t v589 = v2 * 2; + const float32x2_t *v721 = &v5[0]; + svint64_t v722 = svindex_s64(0, v1); + svfloat32_t v725 = svdup_n_f32(v326); + svfloat32_t v726 = svdup_n_f32(v331); + svfloat32_t v730 = svdup_n_f32(v357); + svfloat32_t v731 = svdup_n_f32(v362); + svfloat32_t v732 = svdup_n_f32(v367); + svfloat32_t v733 = svdup_n_f32(v372); + svfloat32_t v734 = svdup_n_f32(v377); + svfloat32_t v735 = svdup_n_f32(v382); + int32_t *v752 = &v6[0]; + svint64_t v861 = svindex_s64(0, v3); + svfloat32_t v191 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v266])); + int64_t v197 = v195 + v266; + int64_t v204 = v10 + v266; + int64_t v211 = v209 + v266; + int64_t v218 = v216 + v266; + int64_t v225 = v223 + v266; + int64_t v232 = v230 + v266; + int64_t v239 = v237 + v266; + int64_t v246 = v244 + v266; + int64_t v253 = v251 + v266; + int64_t v260 = v258 + v266; + int64_t v267 = v265 + v266; + svfloat32_t v612 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v610), v722)); + const float32x2_t *v620 = &v5[v33]; + const float32x2_t *v629 = &v5[v47]; + const float32x2_t *v638 = &v5[v61]; + const float32x2_t *v647 = &v5[v75]; + const float32x2_t *v656 = &v5[v89]; + const float32x2_t *v665 = &v5[v103]; + const float32x2_t *v674 = &v5[v117]; + const float32x2_t *v683 = &v5[v131]; + const float32x2_t *v692 = &v5[v145]; + const float32x2_t *v701 = &v5[v159]; + const float32x2_t *v710 = &v5[v173]; + svfloat32_t v723 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v721), v722)); + svfloat32_t v727 = svdup_n_f32(v339); + svfloat32_t v728 = svdup_n_f32(v346); + svfloat32_t v729 = svdup_n_f32(v353); + svfloat32_t v736 = svdup_n_f32(v390); + svfloat32_t v737 = svdup_n_f32(v397); + svfloat32_t v738 = svdup_n_f32(v404); + svfloat32_t v739 = svdup_n_f32(v411); + svfloat32_t v740 = svdup_n_f32(v418); + svfloat32_t v741 = svdup_n_f32(v425); + svfloat32_t v742 = svdup_n_f32(v432); + svfloat32_t v743 = svdup_n_f32(v439); + svfloat32_t v744 = svdup_n_f32(v446); + int32_t *v761 = &v6[v509]; + int32_t 
*v770 = &v6[v517]; + int32_t *v779 = &v6[v525]; + int32_t *v788 = &v6[v533]; + int32_t *v797 = &v6[v541]; + int32_t *v806 = &v6[v549]; + int32_t *v815 = &v6[v557]; + int32_t *v824 = &v6[v565]; + int32_t *v833 = &v6[v573]; + int32_t *v842 = &v6[v581]; + int32_t *v851 = &v6[v589]; + svfloat32_t zero192 = svdup_n_f32(0); + svfloat32_t v192 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero192, v612, v191, 0), + v612, v191, 90); + svfloat32_t v198 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v197])); + svfloat32_t v205 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v204])); + svfloat32_t v212 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v211])); + svfloat32_t v219 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v218])); + svfloat32_t v226 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v225])); + svfloat32_t v233 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v232])); + svfloat32_t v240 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v239])); + svfloat32_t v247 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v246])); + svfloat32_t v254 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v253])); + svfloat32_t v261 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v260])); + svfloat32_t v268 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v267])); + svfloat32_t v622 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v620), v722)); + svfloat32_t v631 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v629), v722)); + svfloat32_t v640 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v638), v722)); + svfloat32_t v649 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v647), v722)); + svfloat32_t v658 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v656), v722)); + svfloat32_t v667 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v665), v722)); + svfloat32_t v676 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v674), v722)); + svfloat32_t v685 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v683), v722)); + svfloat32_t v694 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v692), v722)); + svfloat32_t v703 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v701), v722)); + svfloat32_t v712 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v710), v722)); + svfloat32_t zero199 = svdup_n_f32(0); + svfloat32_t v199 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero199, v622, v198, 0), + v622, v198, 90); + svfloat32_t zero206 = svdup_n_f32(0); + svfloat32_t v206 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero206, v631, v205, 0), + v631, v205, 90); + svfloat32_t zero213 = svdup_n_f32(0); + svfloat32_t v213 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero213, v640, v212, 0), + v640, v212, 90); + svfloat32_t zero220 = svdup_n_f32(0); + svfloat32_t v220 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero220, v649, v219, 0), + v649, v219, 90); + svfloat32_t zero227 = svdup_n_f32(0); + svfloat32_t v227 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero227, v658, v226, 0), + v658, v226, 90); + 
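+ // Twiddle multiplication: each zero-initialised svcmla pair performs a full complex multiply of a gathered input vector by its twiddle factor (0-degree rotation followed by 90-degree).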
svfloat32_t zero234 = svdup_n_f32(0); + svfloat32_t v234 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero234, v667, v233, 0), + v667, v233, 90); + svfloat32_t zero241 = svdup_n_f32(0); + svfloat32_t v241 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero241, v676, v240, 0), + v676, v240, 90); + svfloat32_t zero248 = svdup_n_f32(0); + svfloat32_t v248 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero248, v685, v247, 0), + v685, v247, 90); + svfloat32_t zero255 = svdup_n_f32(0); + svfloat32_t v255 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero255, v694, v254, 0), + v694, v254, 90); + svfloat32_t zero262 = svdup_n_f32(0); + svfloat32_t v262 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero262, v703, v261, 0), + v703, v261, 90); + svfloat32_t zero269 = svdup_n_f32(0); + svfloat32_t v269 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero269, v712, v268, 0), + v712, v268, 90); + svfloat32_t v270 = svadd_f32_x(svptrue_b32(), v192, v199); + svfloat32_t v271 = svadd_f32_x(svptrue_b32(), v206, v213); + svfloat32_t v272 = svadd_f32_x(svptrue_b32(), v220, v227); + svfloat32_t v273 = svadd_f32_x(svptrue_b32(), v234, v241); + svfloat32_t v274 = svadd_f32_x(svptrue_b32(), v248, v255); + svfloat32_t v275 = svadd_f32_x(svptrue_b32(), v262, v269); + svfloat32_t v276 = svsub_f32_x(svptrue_b32(), v192, v199); + svfloat32_t v277 = svsub_f32_x(svptrue_b32(), v206, v213); + svfloat32_t v278 = svsub_f32_x(svptrue_b32(), v220, v227); + svfloat32_t v279 = svsub_f32_x(svptrue_b32(), v234, v241); + svfloat32_t v280 = svsub_f32_x(svptrue_b32(), v248, v255); + svfloat32_t v281 = svsub_f32_x(svptrue_b32(), v262, v269); + svfloat32_t v282 = svadd_f32_x(svptrue_b32(), v271, v274); + svfloat32_t v284 = svadd_f32_x(svptrue_b32(), v270, v272); + svfloat32_t v287 = svadd_f32_x(svptrue_b32(), v277, v280); + svfloat32_t v289 = svadd_f32_x(svptrue_b32(), v276, v278); + svfloat32_t v291 = svsub_f32_x(svptrue_b32(), v271, v275); + svfloat32_t v292 = svsub_f32_x(svptrue_b32(), v272, v273); + svfloat32_t v293 = svsub_f32_x(svptrue_b32(), v270, v273); + svfloat32_t v294 = svsub_f32_x(svptrue_b32(), v274, v275); + svfloat32_t v299 = svsub_f32_x(svptrue_b32(), v277, v281); + svfloat32_t v300 = svsub_f32_x(svptrue_b32(), v276, v278); + svfloat32_t v301 = svsub_f32_x(svptrue_b32(), v277, v280); + svfloat32_t v302 = svadd_f32_x(svptrue_b32(), v276, v279); + svfloat32_t v303 = svsub_f32_x(svptrue_b32(), v280, v281); + svfloat32_t v304 = svadd_f32_x(svptrue_b32(), v278, v279); + svfloat32_t v283 = svadd_f32_x(svptrue_b32(), v282, v275); + svfloat32_t v285 = svadd_f32_x(svptrue_b32(), v284, v273); + svfloat32_t v288 = svadd_f32_x(svptrue_b32(), v287, v281); + svfloat32_t v290 = svsub_f32_x(svptrue_b32(), v289, v279); + svfloat32_t v295 = svsub_f32_x(svptrue_b32(), v291, v292); + svfloat32_t v296 = svsub_f32_x(svptrue_b32(), v293, v294); + svfloat32_t v297 = svadd_f32_x(svptrue_b32(), v291, v292); + svfloat32_t v298 = svadd_f32_x(svptrue_b32(), v293, v294); + svfloat32_t v317 = svadd_f32_x(svptrue_b32(), v299, v300); + svfloat32_t v318 = svadd_f32_x(svptrue_b32(), v301, v302); + svfloat32_t v319 = svsub_f32_x(svptrue_b32(), v303, v304); + svfloat32_t zero392 = svdup_n_f32(0); + svfloat32_t v392 = svcmla_f32_x(pred_full, zero392, v736, v299, 90); + svfloat32_t zero399 = svdup_n_f32(0); + svfloat32_t v399 = svcmla_f32_x(pred_full, zero399, v737, v300, 90); + svfloat32_t zero413 = svdup_n_f32(0); + svfloat32_t v413 = svcmla_f32_x(pred_full, zero413, v739, v301, 90); + svfloat32_t zero420 = 
svdup_n_f32(0); + svfloat32_t v420 = svcmla_f32_x(pred_full, zero420, v740, v302, 90); + svfloat32_t zero434 = svdup_n_f32(0); + svfloat32_t v434 = svcmla_f32_x(pred_full, zero434, v742, v303, 90); + svfloat32_t v286 = svadd_f32_x(svptrue_b32(), v283, v285); + svfloat32_t v313 = svsub_f32_x(svptrue_b32(), v285, v283); + svfloat32_t v314 = svadd_f32_x(svptrue_b32(), v288, v290); + svfloat32_t v315 = svadd_f32_x(svptrue_b32(), v295, v296); + svfloat32_t v316 = svsub_f32_x(svptrue_b32(), v297, v298); + svfloat32_t zero341 = svdup_n_f32(0); + svfloat32_t v341 = svcmla_f32_x(pred_full, zero341, v727, v288, 90); + svfloat32_t zero348 = svdup_n_f32(0); + svfloat32_t v348 = svcmla_f32_x(pred_full, zero348, v728, v290, 90); + svfloat32_t v360 = svmul_f32_x(svptrue_b32(), v295, v730); + svfloat32_t zero406 = svdup_n_f32(0); + svfloat32_t v406 = svcmla_f32_x(pred_full, zero406, v738, v317, 90); + svfloat32_t zero427 = svdup_n_f32(0); + svfloat32_t v427 = svcmla_f32_x(pred_full, zero427, v741, v318, 90); + svfloat32_t zero448 = svdup_n_f32(0); + svfloat32_t v448 = svcmla_f32_x(pred_full, zero448, v744, v319, 90); + svfloat32_t v312 = svadd_f32_x(svptrue_b32(), v723, v286); + svfloat32_t zero355 = svdup_n_f32(0); + svfloat32_t v355 = svcmla_f32_x(pred_full, zero355, v729, v314, 90); + svfloat32_t v370 = svmul_f32_x(svptrue_b32(), v315, v732); + svfloat32_t v450 = svmla_f32_x(pred_full, v360, v296, v731); + svfloat32_t v462 = svsub_f32_x(svptrue_b32(), v392, v406); + svfloat32_t v463 = svsub_f32_x(svptrue_b32(), v399, v406); + svfloat32_t v464 = svsub_f32_x(svptrue_b32(), v413, v427); + svfloat32_t v465 = svsub_f32_x(svptrue_b32(), v420, v427); + svfloat32_t v466 = svsub_f32_x(svptrue_b32(), v434, v448); + svfloat32_t v467 = svcmla_f32_x(pred_full, v448, v743, v304, 90); + svfloat32_t v449 = svmls_f32_x(pred_full, v312, v286, v725); + svfloat32_t v451 = svmls_f32_x(pred_full, v450, v313, v726); + svfloat32_t v452 = svmla_f32_x(pred_full, v370, v296, v731); + svfloat32_t v454 = svnmls_f32_x(pred_full, v360, v315, v732); + svfloat32_t v468 = svsub_f32_x(svptrue_b32(), v341, v355); + svfloat32_t v469 = svsub_f32_x(svptrue_b32(), v348, v355); + svfloat32_t v480 = svadd_f32_x(svptrue_b32(), v462, v466); + svfloat32_t v482 = svadd_f32_x(svptrue_b32(), v464, v466); + svfloat32_t v484 = svsub_f32_x(svptrue_b32(), v463, v467); + svint16_t v502 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v312, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v453 = svmla_f32_x(pred_full, v452, v313, v726); + svfloat32_t v455 = svmls_f32_x(pred_full, v454, v313, v726); + svfloat32_t v456 = svmla_f32_x(pred_full, v449, v297, v733); + svfloat32_t v458 = svmls_f32_x(pred_full, v449, v298, v734); + svfloat32_t v460 = svmls_f32_x(pred_full, v449, v297, v733); + svfloat32_t v476 = svsub_f32_x(svptrue_b32(), v469, v462); + svfloat32_t v478 = svsub_f32_x(svptrue_b32(), v467, v468); + svfloat32_t v481 = svadd_f32_x(svptrue_b32(), v480, v469); + svfloat32_t v483 = svsub_f32_x(svptrue_b32(), v482, v469); + svfloat32_t v485 = svsub_f32_x(svptrue_b32(), v484, v468); + svfloat32_t v486 = svadd_f32_x(svptrue_b32(), v468, v463); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v752), v861, + svreinterpret_u64_s16(v502)); + svfloat32_t v457 = svmla_f32_x(pred_full, v456, v298, v734); + svfloat32_t v459 = svmls_f32_x(pred_full, v458, v316, v735); + svfloat32_t v461 = svmla_f32_x(pred_full, v460, v316, v735); + 
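+ // Final recombination of the length-13 kernel: the sums and differences below produce the twelve remaining outputs, which are then converted to Q15 and scatter-stored (output 0 was stored above).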
svfloat32_t v477 = svadd_f32_x(svptrue_b32(), v476, v464); + svfloat32_t v479 = svsub_f32_x(svptrue_b32(), v478, v465); + svfloat32_t v487 = svsub_f32_x(svptrue_b32(), v486, v465); + svfloat32_t v470 = svadd_f32_x(svptrue_b32(), v451, v457); + svfloat32_t v471 = svadd_f32_x(svptrue_b32(), v453, v459); + svfloat32_t v472 = svsub_f32_x(svptrue_b32(), v459, v453); + svfloat32_t v473 = svadd_f32_x(svptrue_b32(), v455, v461); + svfloat32_t v474 = svsub_f32_x(svptrue_b32(), v457, v451); + svfloat32_t v475 = svsub_f32_x(svptrue_b32(), v461, v455); + svfloat32_t v488 = svsub_f32_x(svptrue_b32(), v470, v477); + svfloat32_t v489 = svadd_f32_x(svptrue_b32(), v471, v479); + svfloat32_t v490 = svsub_f32_x(svptrue_b32(), v472, v481); + svfloat32_t v491 = svsub_f32_x(svptrue_b32(), v473, v483); + svfloat32_t v492 = svadd_f32_x(svptrue_b32(), v474, v485); + svfloat32_t v493 = svsub_f32_x(svptrue_b32(), v475, v487); + svfloat32_t v494 = svadd_f32_x(svptrue_b32(), v475, v487); + svfloat32_t v495 = svsub_f32_x(svptrue_b32(), v474, v485); + svfloat32_t v496 = svadd_f32_x(svptrue_b32(), v473, v483); + svfloat32_t v497 = svadd_f32_x(svptrue_b32(), v472, v481); + svfloat32_t v498 = svsub_f32_x(svptrue_b32(), v471, v479); + svfloat32_t v499 = svadd_f32_x(svptrue_b32(), v470, v477); + svint16_t v510 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v488, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v518 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v489, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v526 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v490, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v534 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v491, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v542 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v492, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v550 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v493, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v558 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v494, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v566 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v495, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v574 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v496, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v582 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v497, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 
0x0000000000040004ULL))); + svint16_t v590 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v498, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v598 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v499, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v761), v861, + svreinterpret_u64_s16(v510)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v770), v861, + svreinterpret_u64_s16(v518)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v779), v861, + svreinterpret_u64_s16(v526)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v788), v861, + svreinterpret_u64_s16(v534)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v797), v861, + svreinterpret_u64_s16(v542)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v806), v861, + svreinterpret_u64_s16(v550)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v815), v861, + svreinterpret_u64_s16(v558)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v824), v861, + svreinterpret_u64_s16(v566)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v833), v861, + svreinterpret_u64_s16(v574)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v842), v861, + svreinterpret_u64_s16(v582)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v851), v861, + svreinterpret_u64_s16(v590)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v860), v861, + svreinterpret_u64_s16(v598)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cs16_ab_t_gs14(const armral_cmplx_f32_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, + const armral_cmplx_f32_t *restrict w, + int howmany, int idist, int odist, + float dir) { + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + int32_t *v6 = (int32_t *)y; + const float32x2_t *v7 = (const float32x2_t *)w; + for (int j = 0; j < howmany; j += 1) { + float32x2_t v255 = v5[istride]; + float v544 = -1.1666666666666665e+00F; + float v548 = 7.9015646852540022e-01F; + float v552 = 5.5854267289647742e-02F; + float v556 = 7.3430220123575241e-01F; + float v559 = 4.4095855184409838e-01F; + float v560 = -4.4095855184409838e-01F; + float v566 = 3.4087293062393137e-01F; + float v567 = -3.4087293062393137e-01F; + float v573 = -5.3396936033772524e-01F; + float v574 = 5.3396936033772524e-01F; + float v580 = 8.7484229096165667e-01F; + float v581 = -8.7484229096165667e-01F; + float32x2_t v583 = (float32x2_t){v4, v4}; + float32x2_t v287 = vtrn1_f32(v255, v255); + float32x2_t v288 = vtrn2_f32(v255, v255); + float32x2_t v423 = v5[0]; + float32x2_t v545 = (float32x2_t){v544, v544}; + float32x2_t v549 = (float32x2_t){v548, v548}; + float32x2_t v553 = (float32x2_t){v552, v552}; + float32x2_t v557 = (float32x2_t){v556, v556}; + float32x2_t v561 = (float32x2_t){v559, v560}; + float32x2_t v568 = (float32x2_t){v566, v567}; + float32x2_t v575 = (float32x2_t){v573, v574}; + float32x2_t v582 = (float32x2_t){v580, v581}; + float32x2_t v20 = v5[istride * 7]; + int64_t v37 = 12 + j * 26; + float32x2_t v51 = v5[istride * 2]; + float32x2_t v69 = v5[istride * 9]; + int64_t v86 = 2 + j * 26; + int64_t v99 = 16 + j * 26; + float32x2_t v113 = v5[istride * 4]; + float32x2_t v131 = v5[istride * 11]; + int64_t v148 = 6 + j * 26; + int64_t v161 = 20 + 
j * 26; + float32x2_t v175 = v5[istride * 6]; + float32x2_t v193 = v5[istride * 13]; + int64_t v210 = 10 + j * 26; + int64_t v223 = 24 + j * 26; + float32x2_t v237 = v5[istride * 8]; + int64_t v272 = 14 + j * 26; + float32x2_t v286 = v7[j * 26]; + int64_t v290 = j * 26 + 1; + float32x2_t v299 = v5[istride * 10]; + float32x2_t v317 = v5[istride * 3]; + int64_t v334 = 18 + j * 26; + int64_t v347 = 4 + j * 26; + float32x2_t v361 = v5[istride * 12]; + float32x2_t v379 = v5[istride * 5]; + int64_t v396 = 22 + j * 26; + int64_t v409 = 8 + j * 26; + float32x2_t v563 = vmul_f32(v583, v561); + float32x2_t v570 = vmul_f32(v583, v568); + float32x2_t v577 = vmul_f32(v583, v575); + float32x2_t v584 = vmul_f32(v583, v582); + float32x2_t v38 = v7[v37]; + float32x2_t v39 = vtrn1_f32(v20, v20); + float32x2_t v40 = vtrn2_f32(v20, v20); + int64_t v42 = v37 + 1; + float32x2_t v87 = v7[v86]; + float32x2_t v88 = vtrn1_f32(v51, v51); + float32x2_t v89 = vtrn2_f32(v51, v51); + int64_t v91 = v86 + 1; + float32x2_t v100 = v7[v99]; + float32x2_t v101 = vtrn1_f32(v69, v69); + float32x2_t v102 = vtrn2_f32(v69, v69); + int64_t v104 = v99 + 1; + float32x2_t v149 = v7[v148]; + float32x2_t v150 = vtrn1_f32(v113, v113); + float32x2_t v151 = vtrn2_f32(v113, v113); + int64_t v153 = v148 + 1; + float32x2_t v162 = v7[v161]; + float32x2_t v163 = vtrn1_f32(v131, v131); + float32x2_t v164 = vtrn2_f32(v131, v131); + int64_t v166 = v161 + 1; + float32x2_t v211 = v7[v210]; + float32x2_t v212 = vtrn1_f32(v175, v175); + float32x2_t v213 = vtrn2_f32(v175, v175); + int64_t v215 = v210 + 1; + float32x2_t v224 = v7[v223]; + float32x2_t v225 = vtrn1_f32(v193, v193); + float32x2_t v226 = vtrn2_f32(v193, v193); + int64_t v228 = v223 + 1; + float32x2_t v273 = v7[v272]; + float32x2_t v274 = vtrn1_f32(v237, v237); + float32x2_t v275 = vtrn2_f32(v237, v237); + int64_t v277 = v272 + 1; + float32x2_t v291 = v7[v290]; + float32x2_t v292 = vmul_f32(v287, v286); + float32x2_t v335 = v7[v334]; + float32x2_t v336 = vtrn1_f32(v299, v299); + float32x2_t v337 = vtrn2_f32(v299, v299); + int64_t v339 = v334 + 1; + float32x2_t v348 = v7[v347]; + float32x2_t v349 = vtrn1_f32(v317, v317); + float32x2_t v350 = vtrn2_f32(v317, v317); + int64_t v352 = v347 + 1; + float32x2_t v397 = v7[v396]; + float32x2_t v398 = vtrn1_f32(v361, v361); + float32x2_t v399 = vtrn2_f32(v361, v361); + int64_t v401 = v396 + 1; + float32x2_t v410 = v7[v409]; + float32x2_t v411 = vtrn1_f32(v379, v379); + float32x2_t v412 = vtrn2_f32(v379, v379); + int64_t v414 = v409 + 1; + float32x2_t v43 = v7[v42]; + float32x2_t v44 = vmul_f32(v39, v38); + float32x2_t v92 = v7[v91]; + float32x2_t v93 = vmul_f32(v88, v87); + float32x2_t v105 = v7[v104]; + float32x2_t v106 = vmul_f32(v101, v100); + float32x2_t v154 = v7[v153]; + float32x2_t v155 = vmul_f32(v150, v149); + float32x2_t v167 = v7[v166]; + float32x2_t v168 = vmul_f32(v163, v162); + float32x2_t v216 = v7[v215]; + float32x2_t v217 = vmul_f32(v212, v211); + float32x2_t v229 = v7[v228]; + float32x2_t v230 = vmul_f32(v225, v224); + float32x2_t v278 = v7[v277]; + float32x2_t v279 = vmul_f32(v274, v273); + float32x2_t v340 = v7[v339]; + float32x2_t v341 = vmul_f32(v336, v335); + float32x2_t v353 = v7[v352]; + float32x2_t v354 = vmul_f32(v349, v348); + float32x2_t v402 = v7[v401]; + float32x2_t v403 = vmul_f32(v398, v397); + float32x2_t v415 = v7[v414]; + float32x2_t v416 = vmul_f32(v411, v410); + float32x2_t v294 = vfma_f32(v292, v288, v291); + float32x2_t v46 = vfma_f32(v44, v40, v43); + float32x2_t v95 = vfma_f32(v93, v89, v92); + float32x2_t v108 
= vfma_f32(v106, v102, v105); + float32x2_t v157 = vfma_f32(v155, v151, v154); + float32x2_t v170 = vfma_f32(v168, v164, v167); + float32x2_t v219 = vfma_f32(v217, v213, v216); + float32x2_t v232 = vfma_f32(v230, v226, v229); + float32x2_t v281 = vfma_f32(v279, v275, v278); + float32x2_t v343 = vfma_f32(v341, v337, v340); + float32x2_t v356 = vfma_f32(v354, v350, v353); + float32x2_t v405 = vfma_f32(v403, v399, v402); + float32x2_t v418 = vfma_f32(v416, v412, v415); + float32x2_t v424 = vadd_f32(v423, v46); + float32x2_t v425 = vsub_f32(v423, v46); + float32x2_t v426 = vadd_f32(v95, v108); + float32x2_t v427 = vsub_f32(v95, v108); + float32x2_t v428 = vadd_f32(v157, v170); + float32x2_t v429 = vsub_f32(v157, v170); + float32x2_t v430 = vadd_f32(v219, v232); + float32x2_t v431 = vsub_f32(v219, v232); + float32x2_t v432 = vadd_f32(v281, v294); + float32x2_t v433 = vsub_f32(v281, v294); + float32x2_t v434 = vadd_f32(v343, v356); + float32x2_t v435 = vsub_f32(v343, v356); + float32x2_t v436 = vadd_f32(v405, v418); + float32x2_t v437 = vsub_f32(v405, v418); + float32x2_t v438 = vadd_f32(v426, v436); + float32x2_t v439 = vsub_f32(v426, v436); + float32x2_t v440 = vadd_f32(v432, v430); + float32x2_t v441 = vsub_f32(v432, v430); + float32x2_t v442 = vadd_f32(v428, v434); + float32x2_t v443 = vsub_f32(v428, v434); + float32x2_t v522 = vadd_f32(v427, v437); + float32x2_t v523 = vsub_f32(v427, v437); + float32x2_t v524 = vadd_f32(v433, v431); + float32x2_t v525 = vsub_f32(v433, v431); + float32x2_t v526 = vadd_f32(v429, v435); + float32x2_t v527 = vsub_f32(v429, v435); + float32x2_t v444 = vadd_f32(v438, v440); + float32x2_t v447 = vsub_f32(v438, v440); + float32x2_t v448 = vsub_f32(v440, v442); + float32x2_t v449 = vsub_f32(v442, v438); + float32x2_t v450 = vadd_f32(v439, v441); + float32x2_t v452 = vsub_f32(v439, v441); + float32x2_t v453 = vsub_f32(v441, v443); + float32x2_t v454 = vsub_f32(v443, v439); + float32x2_t v528 = vadd_f32(v522, v524); + float32x2_t v531 = vsub_f32(v522, v524); + float32x2_t v532 = vsub_f32(v524, v526); + float32x2_t v533 = vsub_f32(v526, v522); + float32x2_t v534 = vadd_f32(v523, v525); + float32x2_t v536 = vsub_f32(v523, v525); + float32x2_t v537 = vsub_f32(v525, v527); + float32x2_t v538 = vsub_f32(v527, v523); + float32x2_t v445 = vadd_f32(v444, v442); + float32x2_t v451 = vadd_f32(v450, v443); + float32x2_t v466 = vmul_f32(v447, v549); + float32x2_t v470 = vmul_f32(v448, v553); + float32x2_t v474 = vmul_f32(v449, v557); + float32x2_t v487 = vrev64_f32(v452); + float32x2_t v494 = vrev64_f32(v453); + float32x2_t v501 = vrev64_f32(v454); + float32x2_t v529 = vadd_f32(v528, v526); + float32x2_t v535 = vadd_f32(v534, v527); + float32x2_t v550 = vmul_f32(v531, v549); + float32x2_t v554 = vmul_f32(v532, v553); + float32x2_t v558 = vmul_f32(v533, v557); + float32x2_t v571 = vrev64_f32(v536); + float32x2_t v578 = vrev64_f32(v537); + float32x2_t v585 = vrev64_f32(v538); + float32x2_t v446 = vadd_f32(v445, v424); + float32x2_t v462 = vmul_f32(v445, v545); + float32x2_t v480 = vrev64_f32(v451); + float32x2_t v488 = vmul_f32(v487, v570); + float32x2_t v495 = vmul_f32(v494, v577); + float32x2_t v502 = vmul_f32(v501, v584); + float32x2_t v530 = vadd_f32(v529, v425); + float32x2_t v546 = vmul_f32(v529, v545); + float32x2_t v564 = vrev64_f32(v535); + float32x2_t v572 = vmul_f32(v571, v570); + float32x2_t v579 = vmul_f32(v578, v577); + float32x2_t v586 = vmul_f32(v585, v584); + float32x2_t v481 = vmul_f32(v480, v563); + float32x2_t v503 = vadd_f32(v446, v462); + float32x2_t v565 = 
vmul_f32(v564, v563); + float32x2_t v587 = vadd_f32(v530, v546); + int16x4_t v608 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v446, 15), (int32x2_t){0, 0})); + int16x4_t v614 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v530, 15), (int32x2_t){0, 0})); + float32x2_t v504 = vadd_f32(v503, v466); + float32x2_t v506 = vsub_f32(v503, v466); + float32x2_t v508 = vsub_f32(v503, v470); + float32x2_t v510 = vadd_f32(v481, v488); + float32x2_t v512 = vsub_f32(v481, v488); + float32x2_t v514 = vsub_f32(v481, v495); + float32x2_t v588 = vadd_f32(v587, v550); + float32x2_t v590 = vsub_f32(v587, v550); + float32x2_t v592 = vsub_f32(v587, v554); + float32x2_t v594 = vadd_f32(v565, v572); + float32x2_t v596 = vsub_f32(v565, v572); + float32x2_t v598 = vsub_f32(v565, v579); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v608), 0); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v614), 0); + float32x2_t v505 = vadd_f32(v504, v470); + float32x2_t v507 = vsub_f32(v506, v474); + float32x2_t v509 = vadd_f32(v508, v474); + float32x2_t v511 = vadd_f32(v510, v495); + float32x2_t v513 = vsub_f32(v512, v502); + float32x2_t v515 = vadd_f32(v514, v502); + float32x2_t v589 = vadd_f32(v588, v554); + float32x2_t v591 = vsub_f32(v590, v558); + float32x2_t v593 = vadd_f32(v592, v558); + float32x2_t v595 = vadd_f32(v594, v579); + float32x2_t v597 = vsub_f32(v596, v586); + float32x2_t v599 = vadd_f32(v598, v586); + float32x2_t v516 = vadd_f32(v505, v511); + float32x2_t v517 = vsub_f32(v505, v511); + float32x2_t v518 = vadd_f32(v507, v513); + float32x2_t v519 = vsub_f32(v507, v513); + float32x2_t v520 = vadd_f32(v509, v515); + float32x2_t v521 = vsub_f32(v509, v515); + float32x2_t v600 = vadd_f32(v589, v595); + float32x2_t v601 = vsub_f32(v589, v595); + float32x2_t v602 = vadd_f32(v591, v597); + float32x2_t v603 = vsub_f32(v591, v597); + float32x2_t v604 = vadd_f32(v593, v599); + float32x2_t v605 = vsub_f32(v593, v599); + int16x4_t v620 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v517, 15), (int32x2_t){0, 0})); + int16x4_t v626 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v601, 15), (int32x2_t){0, 0})); + int16x4_t v632 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v519, 15), (int32x2_t){0, 0})); + int16x4_t v638 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v603, 15), (int32x2_t){0, 0})); + int16x4_t v644 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v520, 15), (int32x2_t){0, 0})); + int16x4_t v650 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v604, 15), (int32x2_t){0, 0})); + int16x4_t v656 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v521, 15), (int32x2_t){0, 0})); + int16x4_t v662 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v605, 15), (int32x2_t){0, 0})); + int16x4_t v668 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v518, 15), (int32x2_t){0, 0})); + int16x4_t v674 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v602, 15), (int32x2_t){0, 0})); + int16x4_t v680 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v516, 15), (int32x2_t){0, 0})); + int16x4_t v686 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v600, 15), (int32x2_t){0, 0})); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v620), 0); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v626), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v632), 0); + v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v638), 0); + v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v644), 0); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v650), 0); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v656), 0); + v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v662), 0); + 
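// Q15 output stage of this Neon kernel: each float32x2 (re, im) result is scaled by 2^15 with vcvt_n_s32_f32(., 15), saturated to int16 with vqmovn_s32, and the packed pair is written as a single 32-bit store at its permuted position in y.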
v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v668), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v674), 0); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v680), 0); + v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v686), 0); + v5 += 1 * idist; + v6 += 1 * odist; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cs16_ab_t_gs14(const armral_cmplx_f32_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, + const armral_cmplx_f32_t *restrict w, + int howmany, int idist, int odist, + float dir) { + int64_t v0 = istride; + int64_t v1 = idist; + int64_t v2 = ostride; + int64_t v3 = odist; + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + int32_t *v6 = (int32_t *)y; + const float32x2_t *v7 = (const float32x2_t *)w; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * v1; + int64_t v12 = v10 * v3; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + int64_t v13 = j; + float v424 = -1.1666666666666665e+00F; + float v429 = 7.9015646852540022e-01F; + float v434 = 5.5854267289647742e-02F; + float v439 = 7.3430220123575241e-01F; + float v444 = -4.4095855184409838e-01F; + float v451 = -3.4087293062393137e-01F; + float v458 = 5.3396936033772524e-01F; + float v465 = -8.7484229096165667e-01F; + const float32x2_t *v680 = &v5[v0]; + int32_t *v783 = &v6[v2]; + int64_t v19 = v0 * 7; + int64_t v34 = v10 * 6; + int64_t v40 = v0 * 2; + int64_t v54 = v0 * 9; + int64_t v76 = v10 * 8; + int64_t v82 = v0 * 4; + int64_t v96 = v0 * 11; + int64_t v111 = v10 * 3; + int64_t v118 = v10 * 10; + int64_t v124 = v0 * 6; + int64_t v138 = v0 * 13; + int64_t v153 = v10 * 5; + int64_t v160 = v10 * 12; + int64_t v166 = v0 * 8; + int64_t v195 = v10 * 7; + int64_t v208 = v0 * 10; + int64_t v222 = v0 * 3; + int64_t v237 = v10 * 9; + int64_t v244 = v10 * 2; + int64_t v250 = v0 * 12; + int64_t v264 = v0 * 5; + int64_t v279 = v10 * 11; + int64_t v286 = v10 * 4; + int64_t v287 = v13 * 13; + float v447 = v4 * v444; + float v454 = v4 * v451; + float v461 = v4 * v458; + float v468 = v4 * v465; + int64_t v499 = v2 * 7; + int64_t v507 = v2 * 8; + int64_t v523 = v2 * 2; + int64_t v531 = v2 * 9; + int64_t v539 = v2 * 10; + int64_t v547 = v2 * 3; + int64_t v555 = v2 * 4; + int64_t v563 = v2 * 11; + int64_t v571 = v2 * 12; + int64_t v579 = v2 * 5; + int64_t v587 = v2 * 6; + int64_t v595 = v2 * 13; + const float32x2_t *v728 = &v5[0]; + svint64_t v729 = svindex_s64(0, v1); + svfloat32_t v741 = svdup_n_f32(v424); + svfloat32_t v742 = svdup_n_f32(v429); + svfloat32_t v743 = svdup_n_f32(v434); + svfloat32_t v744 = svdup_n_f32(v439); + int32_t *v756 = &v6[0]; + svint64_t v874 = svindex_s64(0, v3); + int64_t v36 = v34 + v287; + int64_t v71 = v10 + v287; + int64_t v78 = v76 + v287; + int64_t v113 = v111 + v287; + int64_t v120 = v118 + v287; + int64_t v155 = v153 + v287; + int64_t v162 = v160 + v287; + int64_t v197 = v195 + v287; + svfloat32_t v205 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v287])); + int64_t v239 = v237 + v287; + int64_t v246 = v244 + v287; + int64_t v281 = v279 + v287; + int64_t v288 = v286 + v287; + const float32x2_t *v608 = &v5[v19]; + const float32x2_t *v617 = &v5[v40]; + const float32x2_t *v626 = &v5[v54]; + const float32x2_t *v635 = &v5[v82]; + const float32x2_t *v644 = &v5[v96]; + const float32x2_t *v653 = &v5[v124]; + const float32x2_t *v662 = &v5[v138]; + const float32x2_t *v671 = &v5[v166]; + svfloat32_t v682 = svreinterpret_f32_f64( + 
svld1_gather_s64index_f64(pred_full, (const double *)(v680), v729)); + const float32x2_t *v691 = &v5[v208]; + const float32x2_t *v700 = &v5[v222]; + const float32x2_t *v709 = &v5[v250]; + const float32x2_t *v718 = &v5[v264]; + svfloat32_t v730 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v728), v729)); + svfloat32_t v745 = svdup_n_f32(v447); + svfloat32_t v746 = svdup_n_f32(v454); + svfloat32_t v747 = svdup_n_f32(v461); + svfloat32_t v748 = svdup_n_f32(v468); + int32_t *v765 = &v6[v499]; + int32_t *v774 = &v6[v507]; + int32_t *v792 = &v6[v523]; + int32_t *v801 = &v6[v531]; + int32_t *v810 = &v6[v539]; + int32_t *v819 = &v6[v547]; + int32_t *v828 = &v6[v555]; + int32_t *v837 = &v6[v563]; + int32_t *v846 = &v6[v571]; + int32_t *v855 = &v6[v579]; + int32_t *v864 = &v6[v587]; + int32_t *v873 = &v6[v595]; + svfloat32_t v37 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v36])); + svfloat32_t v72 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v71])); + svfloat32_t v79 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v78])); + svfloat32_t v114 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v113])); + svfloat32_t v121 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v120])); + svfloat32_t v156 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v155])); + svfloat32_t v163 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v162])); + svfloat32_t v198 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v197])); + svfloat32_t zero206 = svdup_n_f32(0); + svfloat32_t v206 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero206, v682, v205, 0), + v682, v205, 90); + svfloat32_t v240 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v239])); + svfloat32_t v247 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v246])); + svfloat32_t v282 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v281])); + svfloat32_t v289 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v288])); + svfloat32_t v610 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v608), v729)); + svfloat32_t v619 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v617), v729)); + svfloat32_t v628 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v626), v729)); + svfloat32_t v637 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v635), v729)); + svfloat32_t v646 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v644), v729)); + svfloat32_t v655 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v653), v729)); + svfloat32_t v664 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v662), v729)); + svfloat32_t v673 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v671), v729)); + svfloat32_t v693 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v691), v729)); + svfloat32_t v702 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v700), v729)); + svfloat32_t v711 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v709), v729)); + svfloat32_t v720 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v718), v729)); + 
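// Twiddle application in the SVE kernel: each cf32 sample is gathered as one 64-bit element per complex value, and the pair of svcmla_f32_x calls below (rotations 0 and 90 degrees) accumulates the full complex product of the sample with its twiddle factor into a zeroed accumulator.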
svfloat32_t zero38 = svdup_n_f32(0); + svfloat32_t v38 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero38, v610, v37, 0), + v610, v37, 90); + svfloat32_t zero73 = svdup_n_f32(0); + svfloat32_t v73 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero73, v619, v72, 0), + v619, v72, 90); + svfloat32_t zero80 = svdup_n_f32(0); + svfloat32_t v80 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero80, v628, v79, 0), + v628, v79, 90); + svfloat32_t zero115 = svdup_n_f32(0); + svfloat32_t v115 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero115, v637, v114, 0), + v637, v114, 90); + svfloat32_t zero122 = svdup_n_f32(0); + svfloat32_t v122 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero122, v646, v121, 0), + v646, v121, 90); + svfloat32_t zero157 = svdup_n_f32(0); + svfloat32_t v157 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero157, v655, v156, 0), + v655, v156, 90); + svfloat32_t zero164 = svdup_n_f32(0); + svfloat32_t v164 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero164, v664, v163, 0), + v664, v163, 90); + svfloat32_t zero199 = svdup_n_f32(0); + svfloat32_t v199 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero199, v673, v198, 0), + v673, v198, 90); + svfloat32_t zero241 = svdup_n_f32(0); + svfloat32_t v241 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero241, v693, v240, 0), + v693, v240, 90); + svfloat32_t zero248 = svdup_n_f32(0); + svfloat32_t v248 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero248, v702, v247, 0), + v702, v247, 90); + svfloat32_t zero283 = svdup_n_f32(0); + svfloat32_t v283 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero283, v711, v282, 0), + v711, v282, 90); + svfloat32_t zero290 = svdup_n_f32(0); + svfloat32_t v290 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero290, v720, v289, 0), + v720, v289, 90); + svfloat32_t v298 = svadd_f32_x(svptrue_b32(), v730, v38); + svfloat32_t v299 = svsub_f32_x(svptrue_b32(), v730, v38); + svfloat32_t v300 = svadd_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v301 = svsub_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v302 = svadd_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v303 = svsub_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v304 = svadd_f32_x(svptrue_b32(), v157, v164); + svfloat32_t v305 = svsub_f32_x(svptrue_b32(), v157, v164); + svfloat32_t v306 = svadd_f32_x(svptrue_b32(), v199, v206); + svfloat32_t v307 = svsub_f32_x(svptrue_b32(), v199, v206); + svfloat32_t v308 = svadd_f32_x(svptrue_b32(), v241, v248); + svfloat32_t v309 = svsub_f32_x(svptrue_b32(), v241, v248); + svfloat32_t v310 = svadd_f32_x(svptrue_b32(), v283, v290); + svfloat32_t v311 = svsub_f32_x(svptrue_b32(), v283, v290); + svfloat32_t v312 = svadd_f32_x(svptrue_b32(), v300, v310); + svfloat32_t v313 = svsub_f32_x(svptrue_b32(), v300, v310); + svfloat32_t v314 = svadd_f32_x(svptrue_b32(), v306, v304); + svfloat32_t v315 = svsub_f32_x(svptrue_b32(), v306, v304); + svfloat32_t v316 = svadd_f32_x(svptrue_b32(), v302, v308); + svfloat32_t v317 = svsub_f32_x(svptrue_b32(), v302, v308); + svfloat32_t v401 = svadd_f32_x(svptrue_b32(), v301, v311); + svfloat32_t v402 = svsub_f32_x(svptrue_b32(), v301, v311); + svfloat32_t v403 = svadd_f32_x(svptrue_b32(), v307, v305); + svfloat32_t v404 = svsub_f32_x(svptrue_b32(), v307, v305); + svfloat32_t v405 = svadd_f32_x(svptrue_b32(), v303, v309); + svfloat32_t v406 = svsub_f32_x(svptrue_b32(), v303, v309); + svfloat32_t v318 = svadd_f32_x(svptrue_b32(), v312, v314); + svfloat32_t v321 = svsub_f32_x(svptrue_b32(), v312, v314); + 
svfloat32_t v322 = svsub_f32_x(svptrue_b32(), v314, v316); + svfloat32_t v323 = svsub_f32_x(svptrue_b32(), v316, v312); + svfloat32_t v324 = svadd_f32_x(svptrue_b32(), v313, v315); + svfloat32_t v326 = svsub_f32_x(svptrue_b32(), v313, v315); + svfloat32_t v327 = svsub_f32_x(svptrue_b32(), v315, v317); + svfloat32_t v328 = svsub_f32_x(svptrue_b32(), v317, v313); + svfloat32_t v407 = svadd_f32_x(svptrue_b32(), v401, v403); + svfloat32_t v410 = svsub_f32_x(svptrue_b32(), v401, v403); + svfloat32_t v411 = svsub_f32_x(svptrue_b32(), v403, v405); + svfloat32_t v412 = svsub_f32_x(svptrue_b32(), v405, v401); + svfloat32_t v413 = svadd_f32_x(svptrue_b32(), v402, v404); + svfloat32_t v415 = svsub_f32_x(svptrue_b32(), v402, v404); + svfloat32_t v416 = svsub_f32_x(svptrue_b32(), v404, v406); + svfloat32_t v417 = svsub_f32_x(svptrue_b32(), v406, v402); + svfloat32_t v319 = svadd_f32_x(svptrue_b32(), v318, v316); + svfloat32_t v325 = svadd_f32_x(svptrue_b32(), v324, v317); + svfloat32_t zero367 = svdup_n_f32(0); + svfloat32_t v367 = svcmla_f32_x(pred_full, zero367, v746, v326, 90); + svfloat32_t zero374 = svdup_n_f32(0); + svfloat32_t v374 = svcmla_f32_x(pred_full, zero374, v747, v327, 90); + svfloat32_t zero381 = svdup_n_f32(0); + svfloat32_t v381 = svcmla_f32_x(pred_full, zero381, v748, v328, 90); + svfloat32_t v408 = svadd_f32_x(svptrue_b32(), v407, v405); + svfloat32_t v414 = svadd_f32_x(svptrue_b32(), v413, v406); + svfloat32_t zero456 = svdup_n_f32(0); + svfloat32_t v456 = svcmla_f32_x(pred_full, zero456, v746, v415, 90); + svfloat32_t zero463 = svdup_n_f32(0); + svfloat32_t v463 = svcmla_f32_x(pred_full, zero463, v747, v416, 90); + svfloat32_t zero470 = svdup_n_f32(0); + svfloat32_t v470 = svcmla_f32_x(pred_full, zero470, v748, v417, 90); + svfloat32_t v320 = svadd_f32_x(svptrue_b32(), v319, v298); + svfloat32_t zero360 = svdup_n_f32(0); + svfloat32_t v360 = svcmla_f32_x(pred_full, zero360, v745, v325, 90); + svfloat32_t v409 = svadd_f32_x(svptrue_b32(), v408, v299); + svfloat32_t zero449 = svdup_n_f32(0); + svfloat32_t v449 = svcmla_f32_x(pred_full, zero449, v745, v414, 90); + svfloat32_t v382 = svmla_f32_x(pred_full, v320, v319, v741); + svfloat32_t v389 = svadd_f32_x(svptrue_b32(), v360, v367); + svfloat32_t v391 = svsub_f32_x(svptrue_b32(), v360, v367); + svfloat32_t v393 = svsub_f32_x(svptrue_b32(), v360, v374); + svfloat32_t v471 = svmla_f32_x(pred_full, v409, v408, v741); + svfloat32_t v478 = svadd_f32_x(svptrue_b32(), v449, v456); + svfloat32_t v480 = svsub_f32_x(svptrue_b32(), v449, v456); + svfloat32_t v482 = svsub_f32_x(svptrue_b32(), v449, v463); + svint16_t v492 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v320, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v500 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v409, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v383 = svmla_f32_x(pred_full, v382, v321, v742); + svfloat32_t v385 = svmls_f32_x(pred_full, v382, v321, v742); + svfloat32_t v387 = svmls_f32_x(pred_full, v382, v322, v743); + svfloat32_t v390 = svadd_f32_x(svptrue_b32(), v389, v374); + svfloat32_t v392 = svsub_f32_x(svptrue_b32(), v391, v381); + svfloat32_t v394 = svadd_f32_x(svptrue_b32(), v393, v381); + svfloat32_t v472 = svmla_f32_x(pred_full, v471, v410, v742); + svfloat32_t v474 = svmls_f32_x(pred_full, v471, v410, 
v742); + svfloat32_t v476 = svmls_f32_x(pred_full, v471, v411, v743); + svfloat32_t v479 = svadd_f32_x(svptrue_b32(), v478, v463); + svfloat32_t v481 = svsub_f32_x(svptrue_b32(), v480, v470); + svfloat32_t v483 = svadd_f32_x(svptrue_b32(), v482, v470); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v756), v874, + svreinterpret_u64_s16(v492)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v765), v874, + svreinterpret_u64_s16(v500)); + svfloat32_t v384 = svmla_f32_x(pred_full, v383, v322, v743); + svfloat32_t v386 = svmls_f32_x(pred_full, v385, v323, v744); + svfloat32_t v388 = svmla_f32_x(pred_full, v387, v323, v744); + svfloat32_t v473 = svmla_f32_x(pred_full, v472, v411, v743); + svfloat32_t v475 = svmls_f32_x(pred_full, v474, v412, v744); + svfloat32_t v477 = svmla_f32_x(pred_full, v476, v412, v744); + svfloat32_t v395 = svadd_f32_x(svptrue_b32(), v384, v390); + svfloat32_t v396 = svsub_f32_x(svptrue_b32(), v384, v390); + svfloat32_t v397 = svadd_f32_x(svptrue_b32(), v386, v392); + svfloat32_t v398 = svsub_f32_x(svptrue_b32(), v386, v392); + svfloat32_t v399 = svadd_f32_x(svptrue_b32(), v388, v394); + svfloat32_t v400 = svsub_f32_x(svptrue_b32(), v388, v394); + svfloat32_t v484 = svadd_f32_x(svptrue_b32(), v473, v479); + svfloat32_t v485 = svsub_f32_x(svptrue_b32(), v473, v479); + svfloat32_t v486 = svadd_f32_x(svptrue_b32(), v475, v481); + svfloat32_t v487 = svsub_f32_x(svptrue_b32(), v475, v481); + svfloat32_t v488 = svadd_f32_x(svptrue_b32(), v477, v483); + svfloat32_t v489 = svsub_f32_x(svptrue_b32(), v477, v483); + svint16_t v508 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v396, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v516 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v485, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v524 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v398, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v532 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v487, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v540 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v399, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v548 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v488, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v556 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v400, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v564 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v489, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v572 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v397, (float)(1ULL << 31ULL)))), + 
svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v580 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v486, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v588 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v395, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v596 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v484, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v774), v874, + svreinterpret_u64_s16(v508)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v783), v874, + svreinterpret_u64_s16(v516)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v792), v874, + svreinterpret_u64_s16(v524)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v801), v874, + svreinterpret_u64_s16(v532)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v810), v874, + svreinterpret_u64_s16(v540)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v819), v874, + svreinterpret_u64_s16(v548)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v828), v874, + svreinterpret_u64_s16(v556)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v837), v874, + svreinterpret_u64_s16(v564)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v846), v874, + svreinterpret_u64_s16(v572)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v855), v874, + svreinterpret_u64_s16(v580)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v864), v874, + svreinterpret_u64_s16(v588)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v873), v874, + svreinterpret_u64_s16(v596)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cs16_ab_t_gs15(const armral_cmplx_f32_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, + const armral_cmplx_f32_t *restrict w, + int howmany, int idist, int odist, + float dir) { + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + int32_t *v6 = (int32_t *)y; + const float32x2_t *v7 = (const float32x2_t *)w; + for (int j = 0; j < howmany; j += 1) { + float32x2_t v180 = v5[istride]; + float v431 = -1.2500000000000000e+00F; + float v435 = 5.5901699437494745e-01F; + float v438 = 1.5388417685876268e+00F; + float v439 = -1.5388417685876268e+00F; + float v445 = 5.8778525229247325e-01F; + float v446 = -5.8778525229247325e-01F; + float v452 = 3.6327126400268028e-01F; + float v453 = -3.6327126400268028e-01F; + float v477 = -1.4999999999999998e+00F; + float v481 = 1.8749999999999998e+00F; + float v485 = -8.3852549156242107e-01F; + float v488 = -2.3082626528814396e+00F; + float v489 = 2.3082626528814396e+00F; + float v495 = -8.8167787843870971e-01F; + float v496 = 8.8167787843870971e-01F; + float v502 = -5.4490689600402031e-01F; + float v503 = 5.4490689600402031e-01F; + float v526 = 8.6602540378443871e-01F; + float v527 = -8.6602540378443871e-01F; + float v533 = -1.0825317547305484e+00F; + float v534 = 1.0825317547305484e+00F; + float v540 = 4.8412291827592718e-01F; + float v541 = -4.8412291827592718e-01F; + float32x2_t v543 = (float32x2_t){v4, v4}; + float v548 = -1.3326760640014592e+00F; + float v552 = -5.0903696045512736e-01F; + 
float v556 = -3.1460214309120460e-01F; + float32x2_t v212 = vtrn1_f32(v180, v180); + float32x2_t v213 = vtrn2_f32(v180, v180); + float32x2_t v404 = v5[0]; + float32x2_t v432 = (float32x2_t){v431, v431}; + float32x2_t v436 = (float32x2_t){v435, v435}; + float32x2_t v440 = (float32x2_t){v438, v439}; + float32x2_t v447 = (float32x2_t){v445, v446}; + float32x2_t v454 = (float32x2_t){v452, v453}; + float32x2_t v478 = (float32x2_t){v477, v477}; + float32x2_t v482 = (float32x2_t){v481, v481}; + float32x2_t v486 = (float32x2_t){v485, v485}; + float32x2_t v490 = (float32x2_t){v488, v489}; + float32x2_t v497 = (float32x2_t){v495, v496}; + float32x2_t v504 = (float32x2_t){v502, v503}; + float32x2_t v528 = (float32x2_t){v526, v527}; + float32x2_t v535 = (float32x2_t){v533, v534}; + float32x2_t v542 = (float32x2_t){v540, v541}; + float32x2_t v549 = (float32x2_t){v548, v548}; + float32x2_t v553 = (float32x2_t){v552, v552}; + float32x2_t v557 = (float32x2_t){v556, v556}; + float32x2_t v20 = v5[istride * 5]; + float32x2_t v38 = v5[istride * 10]; + int64_t v55 = 8 + j * 28; + int64_t v68 = 18 + j * 28; + float32x2_t v82 = v5[istride * 8]; + float32x2_t v100 = v5[istride * 13]; + int64_t v117 = 14 + j * 28; + int64_t v130 = 24 + j * 28; + float32x2_t v144 = v5[istride * 3]; + int64_t v148 = 4 + j * 28; + float32x2_t v162 = v5[istride * 11]; + int64_t v197 = 20 + j * 28; + float32x2_t v211 = v7[j * 28]; + int64_t v215 = j * 28 + 1; + float32x2_t v224 = v5[istride * 6]; + int64_t v228 = 10 + j * 28; + float32x2_t v242 = v5[istride * 14]; + float32x2_t v260 = v5[istride * 4]; + int64_t v277 = 26 + j * 28; + int64_t v290 = 6 + j * 28; + float32x2_t v304 = v5[istride * 9]; + int64_t v308 = 16 + j * 28; + float32x2_t v322 = v5[istride * 2]; + float32x2_t v340 = v5[istride * 7]; + int64_t v357 = 2 + j * 28; + int64_t v370 = 12 + j * 28; + float32x2_t v384 = v5[istride * 12]; + int64_t v388 = 22 + j * 28; + float32x2_t v442 = vmul_f32(v543, v440); + float32x2_t v449 = vmul_f32(v543, v447); + float32x2_t v456 = vmul_f32(v543, v454); + float32x2_t v492 = vmul_f32(v543, v490); + float32x2_t v499 = vmul_f32(v543, v497); + float32x2_t v506 = vmul_f32(v543, v504); + float32x2_t v530 = vmul_f32(v543, v528); + float32x2_t v537 = vmul_f32(v543, v535); + float32x2_t v544 = vmul_f32(v543, v542); + float32x2_t v56 = v7[v55]; + float32x2_t v57 = vtrn1_f32(v20, v20); + float32x2_t v58 = vtrn2_f32(v20, v20); + int64_t v60 = v55 + 1; + float32x2_t v69 = v7[v68]; + float32x2_t v70 = vtrn1_f32(v38, v38); + float32x2_t v71 = vtrn2_f32(v38, v38); + int64_t v73 = v68 + 1; + float32x2_t v118 = v7[v117]; + float32x2_t v119 = vtrn1_f32(v82, v82); + float32x2_t v120 = vtrn2_f32(v82, v82); + int64_t v122 = v117 + 1; + float32x2_t v131 = v7[v130]; + float32x2_t v132 = vtrn1_f32(v100, v100); + float32x2_t v133 = vtrn2_f32(v100, v100); + int64_t v135 = v130 + 1; + float32x2_t v149 = v7[v148]; + float32x2_t v150 = vtrn1_f32(v144, v144); + float32x2_t v151 = vtrn2_f32(v144, v144); + int64_t v153 = v148 + 1; + float32x2_t v198 = v7[v197]; + float32x2_t v199 = vtrn1_f32(v162, v162); + float32x2_t v200 = vtrn2_f32(v162, v162); + int64_t v202 = v197 + 1; + float32x2_t v216 = v7[v215]; + float32x2_t v217 = vmul_f32(v212, v211); + float32x2_t v229 = v7[v228]; + float32x2_t v230 = vtrn1_f32(v224, v224); + float32x2_t v231 = vtrn2_f32(v224, v224); + int64_t v233 = v228 + 1; + float32x2_t v278 = v7[v277]; + float32x2_t v279 = vtrn1_f32(v242, v242); + float32x2_t v280 = vtrn2_f32(v242, v242); + int64_t v282 = v277 + 1; + float32x2_t v291 = v7[v290]; + 
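// Neon complex multiply by a twiddle factor: vtrn1_f32/vtrn2_f32 broadcast the sample's real and imaginary parts, which are then combined with the two precomputed halves of the twiddle (w[k], w[k + 1]) via vmul_f32 followed by vfma_f32.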
float32x2_t v292 = vtrn1_f32(v260, v260); + float32x2_t v293 = vtrn2_f32(v260, v260); + int64_t v295 = v290 + 1; + float32x2_t v309 = v7[v308]; + float32x2_t v310 = vtrn1_f32(v304, v304); + float32x2_t v311 = vtrn2_f32(v304, v304); + int64_t v313 = v308 + 1; + float32x2_t v358 = v7[v357]; + float32x2_t v359 = vtrn1_f32(v322, v322); + float32x2_t v360 = vtrn2_f32(v322, v322); + int64_t v362 = v357 + 1; + float32x2_t v371 = v7[v370]; + float32x2_t v372 = vtrn1_f32(v340, v340); + float32x2_t v373 = vtrn2_f32(v340, v340); + int64_t v375 = v370 + 1; + float32x2_t v389 = v7[v388]; + float32x2_t v390 = vtrn1_f32(v384, v384); + float32x2_t v391 = vtrn2_f32(v384, v384); + int64_t v393 = v388 + 1; + float32x2_t v61 = v7[v60]; + float32x2_t v62 = vmul_f32(v57, v56); + float32x2_t v74 = v7[v73]; + float32x2_t v75 = vmul_f32(v70, v69); + float32x2_t v123 = v7[v122]; + float32x2_t v124 = vmul_f32(v119, v118); + float32x2_t v136 = v7[v135]; + float32x2_t v137 = vmul_f32(v132, v131); + float32x2_t v154 = v7[v153]; + float32x2_t v155 = vmul_f32(v150, v149); + float32x2_t v203 = v7[v202]; + float32x2_t v204 = vmul_f32(v199, v198); + float32x2_t v234 = v7[v233]; + float32x2_t v235 = vmul_f32(v230, v229); + float32x2_t v283 = v7[v282]; + float32x2_t v284 = vmul_f32(v279, v278); + float32x2_t v296 = v7[v295]; + float32x2_t v297 = vmul_f32(v292, v291); + float32x2_t v314 = v7[v313]; + float32x2_t v315 = vmul_f32(v310, v309); + float32x2_t v363 = v7[v362]; + float32x2_t v364 = vmul_f32(v359, v358); + float32x2_t v376 = v7[v375]; + float32x2_t v377 = vmul_f32(v372, v371); + float32x2_t v394 = v7[v393]; + float32x2_t v395 = vmul_f32(v390, v389); + float32x2_t v219 = vfma_f32(v217, v213, v216); + float32x2_t v64 = vfma_f32(v62, v58, v61); + float32x2_t v77 = vfma_f32(v75, v71, v74); + float32x2_t v126 = vfma_f32(v124, v120, v123); + float32x2_t v139 = vfma_f32(v137, v133, v136); + float32x2_t v157 = vfma_f32(v155, v151, v154); + float32x2_t v206 = vfma_f32(v204, v200, v203); + float32x2_t v237 = vfma_f32(v235, v231, v234); + float32x2_t v286 = vfma_f32(v284, v280, v283); + float32x2_t v299 = vfma_f32(v297, v293, v296); + float32x2_t v317 = vfma_f32(v315, v311, v314); + float32x2_t v366 = vfma_f32(v364, v360, v363); + float32x2_t v379 = vfma_f32(v377, v373, v376); + float32x2_t v397 = vfma_f32(v395, v391, v394); + float32x2_t v398 = vadd_f32(v64, v77); + float32x2_t v399 = vsub_f32(v64, v77); + float32x2_t v406 = vadd_f32(v126, v139); + float32x2_t v407 = vsub_f32(v126, v139); + float32x2_t v409 = vadd_f32(v206, v219); + float32x2_t v410 = vsub_f32(v206, v219); + float32x2_t v412 = vadd_f32(v286, v299); + float32x2_t v413 = vsub_f32(v286, v299); + float32x2_t v415 = vadd_f32(v366, v379); + float32x2_t v416 = vsub_f32(v366, v379); + float32x2_t v405 = vadd_f32(v398, v404); + float32x2_t v408 = vadd_f32(v406, v157); + float32x2_t v411 = vadd_f32(v409, v237); + float32x2_t v414 = vadd_f32(v412, v317); + float32x2_t v417 = vadd_f32(v415, v397); + float32x2_t v468 = vadd_f32(v406, v415); + float32x2_t v469 = vsub_f32(v406, v415); + float32x2_t v470 = vadd_f32(v412, v409); + float32x2_t v471 = vsub_f32(v412, v409); + float32x2_t v518 = vadd_f32(v407, v416); + float32x2_t v519 = vsub_f32(v407, v416); + float32x2_t v520 = vadd_f32(v413, v410); + float32x2_t v521 = vsub_f32(v413, v410); + float32x2_t v418 = vadd_f32(v408, v417); + float32x2_t v419 = vsub_f32(v408, v417); + float32x2_t v420 = vadd_f32(v414, v411); + float32x2_t v421 = vsub_f32(v414, v411); + float32x2_t v472 = vadd_f32(v468, v470); + float32x2_t v473 = 
vsub_f32(v468, v470); + float32x2_t v474 = vadd_f32(v469, v471); + float32x2_t v493 = vrev64_f32(v469); + float32x2_t v507 = vrev64_f32(v471); + float32x2_t v522 = vadd_f32(v518, v520); + float32x2_t v523 = vsub_f32(v518, v520); + float32x2_t v524 = vadd_f32(v519, v521); + float32x2_t v550 = vmul_f32(v519, v549); + float32x2_t v558 = vmul_f32(v521, v557); + float32x2_t v422 = vadd_f32(v418, v420); + float32x2_t v423 = vsub_f32(v418, v420); + float32x2_t v424 = vadd_f32(v419, v421); + float32x2_t v443 = vrev64_f32(v419); + float32x2_t v457 = vrev64_f32(v421); + float32x2_t v475 = vadd_f32(v472, v398); + float32x2_t v483 = vmul_f32(v472, v482); + float32x2_t v487 = vmul_f32(v473, v486); + float32x2_t v494 = vmul_f32(v493, v492); + float32x2_t v500 = vrev64_f32(v474); + float32x2_t v508 = vmul_f32(v507, v506); + float32x2_t v525 = vadd_f32(v522, v399); + float32x2_t v538 = vrev64_f32(v522); + float32x2_t v545 = vrev64_f32(v523); + float32x2_t v554 = vmul_f32(v524, v553); + float32x2_t v425 = vadd_f32(v422, v405); + float32x2_t v433 = vmul_f32(v422, v432); + float32x2_t v437 = vmul_f32(v423, v436); + float32x2_t v444 = vmul_f32(v443, v442); + float32x2_t v450 = vrev64_f32(v424); + float32x2_t v458 = vmul_f32(v457, v456); + float32x2_t v479 = vmul_f32(v475, v478); + float32x2_t v501 = vmul_f32(v500, v499); + float32x2_t v531 = vrev64_f32(v525); + float32x2_t v539 = vmul_f32(v538, v537); + float32x2_t v546 = vmul_f32(v545, v544); + float32x2_t v562 = vsub_f32(v550, v554); + float32x2_t v563 = vadd_f32(v554, v558); + float32x2_t v451 = vmul_f32(v450, v449); + float32x2_t v459 = vadd_f32(v425, v433); + float32x2_t v509 = vadd_f32(v479, v483); + float32x2_t v512 = vsub_f32(v494, v501); + float32x2_t v513 = vadd_f32(v501, v508); + float32x2_t v532 = vmul_f32(v531, v530); + float32x2_t v568 = vadd_f32(v425, v479); + int16x4_t v573 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v425, 15), (int32x2_t){0, 0})); + float32x2_t v460 = vadd_f32(v459, v437); + float32x2_t v461 = vsub_f32(v459, v437); + float32x2_t v462 = vsub_f32(v444, v451); + float32x2_t v463 = vadd_f32(v451, v458); + float32x2_t v510 = vadd_f32(v509, v487); + float32x2_t v511 = vsub_f32(v509, v487); + float32x2_t v559 = vadd_f32(v532, v539); + float32x2_t v569 = vadd_f32(v568, v532); + float32x2_t v570 = vsub_f32(v568, v532); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v573), 0); + float32x2_t v464 = vadd_f32(v460, v462); + float32x2_t v465 = vsub_f32(v460, v462); + float32x2_t v466 = vadd_f32(v461, v463); + float32x2_t v467 = vsub_f32(v461, v463); + float32x2_t v514 = vadd_f32(v510, v512); + float32x2_t v515 = vsub_f32(v510, v512); + float32x2_t v516 = vadd_f32(v511, v513); + float32x2_t v517 = vsub_f32(v511, v513); + float32x2_t v560 = vadd_f32(v559, v546); + float32x2_t v561 = vsub_f32(v559, v546); + int16x4_t v579 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v570, 15), (int32x2_t){0, 0})); + int16x4_t v585 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v569, 15), (int32x2_t){0, 0})); + float32x2_t v564 = vadd_f32(v560, v562); + float32x2_t v565 = vsub_f32(v560, v562); + float32x2_t v566 = vadd_f32(v561, v563); + float32x2_t v567 = vsub_f32(v561, v563); + v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v579), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v585), 0); + float32x2_t v589 = vadd_f32(v465, v515); + int16x4_t v594 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v465, 15), (int32x2_t){0, 0})); + float32x2_t v610 = vadd_f32(v467, v517); + int16x4_t v615 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v467, 15), (int32x2_t){0, 
0})); + float32x2_t v631 = vadd_f32(v466, v516); + int16x4_t v636 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v466, 15), (int32x2_t){0, 0})); + float32x2_t v652 = vadd_f32(v464, v514); + int16x4_t v657 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v464, 15), (int32x2_t){0, 0})); + float32x2_t v590 = vadd_f32(v589, v565); + float32x2_t v591 = vsub_f32(v589, v565); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v594), 0); + float32x2_t v611 = vadd_f32(v610, v567); + float32x2_t v612 = vsub_f32(v610, v567); + v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v615), 0); + float32x2_t v632 = vadd_f32(v631, v566); + float32x2_t v633 = vsub_f32(v631, v566); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v636), 0); + float32x2_t v653 = vadd_f32(v652, v564); + float32x2_t v654 = vsub_f32(v652, v564); + v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v657), 0); + int16x4_t v600 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v591, 15), (int32x2_t){0, 0})); + int16x4_t v606 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v590, 15), (int32x2_t){0, 0})); + int16x4_t v621 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v612, 15), (int32x2_t){0, 0})); + int16x4_t v627 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v611, 15), (int32x2_t){0, 0})); + int16x4_t v642 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v633, 15), (int32x2_t){0, 0})); + int16x4_t v648 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v632, 15), (int32x2_t){0, 0})); + int16x4_t v663 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v654, 15), (int32x2_t){0, 0})); + int16x4_t v669 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v653, 15), (int32x2_t){0, 0})); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v600), 0); + v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v606), 0); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v621), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v627), 0); + v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v642), 0); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v648), 0); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v663), 0); + v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v669), 0); + v5 += 1 * idist; + v6 += 1 * odist; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cs16_ab_t_gs15(const armral_cmplx_f32_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, + const armral_cmplx_f32_t *restrict w, + int howmany, int idist, int odist, + float dir) { + int64_t v0 = istride; + int64_t v1 = idist; + int64_t v2 = ostride; + int64_t v3 = odist; + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + int32_t *v6 = (int32_t *)y; + const float32x2_t *v7 = (const float32x2_t *)w; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * v1; + int64_t v12 = v10 * v3; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + int64_t v13 = j; + float v320 = -1.2500000000000000e+00F; + float v325 = 5.5901699437494745e-01F; + float v330 = -1.5388417685876268e+00F; + float v337 = -5.8778525229247325e-01F; + float v344 = -3.6327126400268028e-01F; + float v368 = -1.4999999999999998e+00F; + float v373 = 1.8749999999999998e+00F; + float v378 = -8.3852549156242107e-01F; + float v383 = 2.3082626528814396e+00F; + float v390 = 8.8167787843870971e-01F; + float v397 = 5.4490689600402031e-01F; + float v421 = -8.6602540378443871e-01F; + float v428 = 1.0825317547305484e+00F; + float v435 = -4.8412291827592718e-01F; + float v442 = -1.3326760640014592e+00F; + float v447 = 
-5.0903696045512736e-01F; + float v452 = -3.1460214309120460e-01F; + const float32x2_t *v660 = &v5[v0]; + int32_t *v799 = &v6[v2]; + int64_t v19 = v0 * 5; + int64_t v33 = v0 * 10; + int64_t v48 = v10 * 4; + int64_t v55 = v10 * 9; + int64_t v61 = v0 * 8; + int64_t v75 = v0 * 13; + int64_t v90 = v10 * 7; + int64_t v97 = v10 * 12; + int64_t v103 = v0 * 3; + int64_t v111 = v10 * 2; + int64_t v117 = v0 * 11; + int64_t v146 = v10 * 10; + int64_t v159 = v0 * 6; + int64_t v167 = v10 * 5; + int64_t v173 = v0 * 14; + int64_t v187 = v0 * 4; + int64_t v202 = v10 * 13; + int64_t v209 = v10 * 3; + int64_t v215 = v0 * 9; + int64_t v223 = v10 * 8; + int64_t v229 = v0 * 2; + int64_t v243 = v0 * 7; + int64_t v265 = v10 * 6; + int64_t v271 = v0 * 12; + int64_t v279 = v10 * 11; + int64_t v280 = v13 * 14; + float v333 = v4 * v330; + float v340 = v4 * v337; + float v347 = v4 * v344; + float v386 = v4 * v383; + float v393 = v4 * v390; + float v400 = v4 * v397; + float v424 = v4 * v421; + float v431 = v4 * v428; + float v438 = v4 * v435; + int64_t v477 = v2 * 10; + int64_t v485 = v2 * 5; + int64_t v496 = v2 * 6; + int64_t v512 = v2 * 11; + int64_t v523 = v2 * 12; + int64_t v531 = v2 * 7; + int64_t v539 = v2 * 2; + int64_t v550 = v2 * 3; + int64_t v558 = v2 * 13; + int64_t v566 = v2 * 8; + int64_t v577 = v2 * 9; + int64_t v585 = v2 * 4; + int64_t v593 = v2 * 14; + const float32x2_t *v735 = &v5[0]; + svint64_t v736 = svindex_s64(0, v1); + svfloat32_t v739 = svdup_n_f32(v320); + svfloat32_t v740 = svdup_n_f32(v325); + svfloat32_t v744 = svdup_n_f32(v368); + svfloat32_t v745 = svdup_n_f32(v373); + svfloat32_t v746 = svdup_n_f32(v378); + svfloat32_t v753 = svdup_n_f32(v442); + svfloat32_t v754 = svdup_n_f32(v447); + svfloat32_t v755 = svdup_n_f32(v452); + int32_t *v763 = &v6[0]; + svint64_t v890 = svindex_s64(0, v3); + int64_t v50 = v48 + v280; + int64_t v57 = v55 + v280; + int64_t v92 = v90 + v280; + int64_t v99 = v97 + v280; + int64_t v113 = v111 + v280; + int64_t v148 = v146 + v280; + svfloat32_t v156 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v280])); + int64_t v169 = v167 + v280; + int64_t v204 = v202 + v280; + int64_t v211 = v209 + v280; + int64_t v225 = v223 + v280; + int64_t v260 = v10 + v280; + int64_t v267 = v265 + v280; + int64_t v281 = v279 + v280; + const float32x2_t *v606 = &v5[v19]; + const float32x2_t *v615 = &v5[v33]; + const float32x2_t *v624 = &v5[v61]; + const float32x2_t *v633 = &v5[v75]; + const float32x2_t *v642 = &v5[v103]; + const float32x2_t *v651 = &v5[v117]; + svfloat32_t v662 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v660), v736)); + const float32x2_t *v671 = &v5[v159]; + const float32x2_t *v680 = &v5[v173]; + const float32x2_t *v689 = &v5[v187]; + const float32x2_t *v698 = &v5[v215]; + const float32x2_t *v707 = &v5[v229]; + const float32x2_t *v716 = &v5[v243]; + const float32x2_t *v725 = &v5[v271]; + svfloat32_t v737 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v735), v736)); + svfloat32_t v741 = svdup_n_f32(v333); + svfloat32_t v742 = svdup_n_f32(v340); + svfloat32_t v743 = svdup_n_f32(v347); + svfloat32_t v747 = svdup_n_f32(v386); + svfloat32_t v748 = svdup_n_f32(v393); + svfloat32_t v749 = svdup_n_f32(v400); + svfloat32_t v750 = svdup_n_f32(v424); + svfloat32_t v751 = svdup_n_f32(v431); + svfloat32_t v752 = svdup_n_f32(v438); + int32_t *v772 = &v6[v477]; + int32_t *v781 = &v6[v485]; + int32_t *v790 = &v6[v496]; + int32_t *v808 = &v6[v512]; + int32_t *v817 = &v6[v523]; + int32_t *v826 = 
&v6[v531]; + int32_t *v835 = &v6[v539]; + int32_t *v844 = &v6[v550]; + int32_t *v853 = &v6[v558]; + int32_t *v862 = &v6[v566]; + int32_t *v871 = &v6[v577]; + int32_t *v880 = &v6[v585]; + int32_t *v889 = &v6[v593]; + svfloat32_t v51 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v50])); + svfloat32_t v58 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v57])); + svfloat32_t v93 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v92])); + svfloat32_t v100 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v99])); + svfloat32_t v114 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v113])); + svfloat32_t v149 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v148])); + svfloat32_t zero157 = svdup_n_f32(0); + svfloat32_t v157 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero157, v662, v156, 0), + v662, v156, 90); + svfloat32_t v170 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v169])); + svfloat32_t v205 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v204])); + svfloat32_t v212 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v211])); + svfloat32_t v226 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v225])); + svfloat32_t v261 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v260])); + svfloat32_t v268 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v267])); + svfloat32_t v282 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v281])); + svfloat32_t v608 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v606), v736)); + svfloat32_t v617 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v615), v736)); + svfloat32_t v626 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v624), v736)); + svfloat32_t v635 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v633), v736)); + svfloat32_t v644 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v642), v736)); + svfloat32_t v653 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v651), v736)); + svfloat32_t v673 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v671), v736)); + svfloat32_t v682 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v680), v736)); + svfloat32_t v691 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v689), v736)); + svfloat32_t v700 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v698), v736)); + svfloat32_t v709 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v707), v736)); + svfloat32_t v718 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v716), v736)); + svfloat32_t v727 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v725), v736)); + svfloat32_t zero52 = svdup_n_f32(0); + svfloat32_t v52 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero52, v608, v51, 0), + v608, v51, 90); + svfloat32_t zero59 = svdup_n_f32(0); + svfloat32_t v59 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero59, v617, v58, 0), + v617, v58, 90); + svfloat32_t zero94 = svdup_n_f32(0); + svfloat32_t v94 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero94, v626, v93, 
0), + v626, v93, 90); + svfloat32_t zero101 = svdup_n_f32(0); + svfloat32_t v101 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero101, v635, v100, 0), + v635, v100, 90); + svfloat32_t zero150 = svdup_n_f32(0); + svfloat32_t v150 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero150, v653, v149, 0), + v653, v149, 90); + svfloat32_t zero206 = svdup_n_f32(0); + svfloat32_t v206 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero206, v682, v205, 0), + v682, v205, 90); + svfloat32_t zero213 = svdup_n_f32(0); + svfloat32_t v213 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero213, v691, v212, 0), + v691, v212, 90); + svfloat32_t zero262 = svdup_n_f32(0); + svfloat32_t v262 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero262, v709, v261, 0), + v709, v261, 90); + svfloat32_t zero269 = svdup_n_f32(0); + svfloat32_t v269 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero269, v718, v268, 0), + v718, v268, 90); + svfloat32_t v284 = svadd_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v285 = svsub_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v294 = svadd_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v295 = svsub_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v297 = svadd_f32_x(svptrue_b32(), v150, v157); + svfloat32_t v298 = svsub_f32_x(svptrue_b32(), v150, v157); + svfloat32_t v300 = svadd_f32_x(svptrue_b32(), v206, v213); + svfloat32_t v301 = svsub_f32_x(svptrue_b32(), v206, v213); + svfloat32_t v303 = svadd_f32_x(svptrue_b32(), v262, v269); + svfloat32_t v304 = svsub_f32_x(svptrue_b32(), v262, v269); + svfloat32_t v293 = svadd_f32_x(svptrue_b32(), v284, v737); + svfloat32_t v296 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, v294, v644, v114, 0), + v644, v114, 90); + svfloat32_t v299 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, v297, v673, v170, 0), + v673, v170, 90); + svfloat32_t v302 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, v300, v700, v226, 0), + v700, v226, 90); + svfloat32_t v305 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, v303, v727, v282, 0), + v727, v282, 90); + svfloat32_t v359 = svadd_f32_x(svptrue_b32(), v294, v303); + svfloat32_t v360 = svsub_f32_x(svptrue_b32(), v294, v303); + svfloat32_t v361 = svadd_f32_x(svptrue_b32(), v300, v297); + svfloat32_t v362 = svsub_f32_x(svptrue_b32(), v300, v297); + svfloat32_t v412 = svadd_f32_x(svptrue_b32(), v295, v304); + svfloat32_t v413 = svsub_f32_x(svptrue_b32(), v295, v304); + svfloat32_t v414 = svadd_f32_x(svptrue_b32(), v301, v298); + svfloat32_t v415 = svsub_f32_x(svptrue_b32(), v301, v298); + svfloat32_t v306 = svadd_f32_x(svptrue_b32(), v296, v305); + svfloat32_t v307 = svsub_f32_x(svptrue_b32(), v296, v305); + svfloat32_t v308 = svadd_f32_x(svptrue_b32(), v302, v299); + svfloat32_t v309 = svsub_f32_x(svptrue_b32(), v302, v299); + svfloat32_t v363 = svadd_f32_x(svptrue_b32(), v359, v361); + svfloat32_t v364 = svsub_f32_x(svptrue_b32(), v359, v361); + svfloat32_t v365 = svadd_f32_x(svptrue_b32(), v360, v362); + svfloat32_t zero388 = svdup_n_f32(0); + svfloat32_t v388 = svcmla_f32_x(pred_full, zero388, v747, v360, 90); + svfloat32_t v416 = svadd_f32_x(svptrue_b32(), v412, v414); + svfloat32_t v417 = svsub_f32_x(svptrue_b32(), v412, v414); + svfloat32_t v418 = svadd_f32_x(svptrue_b32(), v413, v415); + svfloat32_t v455 = svmul_f32_x(svptrue_b32(), v415, v755); + svfloat32_t v310 = svadd_f32_x(svptrue_b32(), v306, v308); + svfloat32_t v311 = svsub_f32_x(svptrue_b32(), v306, v308); + svfloat32_t v312 = svadd_f32_x(svptrue_b32(), v307, v309); + svfloat32_t zero335 = 
svdup_n_f32(0); + svfloat32_t v335 = svcmla_f32_x(pred_full, zero335, v741, v307, 90); + svfloat32_t v366 = svadd_f32_x(svptrue_b32(), v363, v284); + svfloat32_t v376 = svmul_f32_x(svptrue_b32(), v363, v745); + svfloat32_t zero395 = svdup_n_f32(0); + svfloat32_t v395 = svcmla_f32_x(pred_full, zero395, v748, v365, 90); + svfloat32_t v419 = svadd_f32_x(svptrue_b32(), v416, v285); + svfloat32_t zero440 = svdup_n_f32(0); + svfloat32_t v440 = svcmla_f32_x(pred_full, zero440, v752, v417, 90); + svfloat32_t v450 = svmul_f32_x(svptrue_b32(), v418, v754); + svfloat32_t v313 = svadd_f32_x(svptrue_b32(), v310, v293); + svfloat32_t zero342 = svdup_n_f32(0); + svfloat32_t v342 = svcmla_f32_x(pred_full, zero342, v742, v312, 90); + svfloat32_t v406 = svsub_f32_x(svptrue_b32(), v388, v395); + svfloat32_t v407 = svcmla_f32_x(pred_full, v395, v749, v362, 90); + svfloat32_t zero426 = svdup_n_f32(0); + svfloat32_t v426 = svcmla_f32_x(pred_full, zero426, v750, v419, 90); + svfloat32_t v459 = svnmls_f32_x(pred_full, v450, v413, v753); + svfloat32_t v460 = svmla_f32_x(pred_full, v455, v418, v754); + svfloat32_t v350 = svmla_f32_x(pred_full, v313, v310, v739); + svfloat32_t v353 = svsub_f32_x(svptrue_b32(), v335, v342); + svfloat32_t v354 = svcmla_f32_x(pred_full, v342, v743, v309, 90); + svfloat32_t v403 = svmla_f32_x(pred_full, v376, v366, v744); + svfloat32_t v456 = svcmla_f32_x(pred_full, v426, v751, v416, 90); + svfloat32_t v465 = svmla_f32_x(pred_full, v313, v366, v744); + svint16_t v470 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v313, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v351 = svmla_f32_x(pred_full, v350, v311, v740); + svfloat32_t v352 = svmls_f32_x(pred_full, v350, v311, v740); + svfloat32_t v404 = svmla_f32_x(pred_full, v403, v364, v746); + svfloat32_t v405 = svmls_f32_x(pred_full, v403, v364, v746); + svfloat32_t v457 = svadd_f32_x(svptrue_b32(), v456, v440); + svfloat32_t v458 = svsub_f32_x(svptrue_b32(), v456, v440); + svfloat32_t v466 = svadd_f32_x(svptrue_b32(), v465, v426); + svfloat32_t v467 = svsub_f32_x(svptrue_b32(), v465, v426); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v763), v890, + svreinterpret_u64_s16(v470)); + svfloat32_t v355 = svadd_f32_x(svptrue_b32(), v351, v353); + svfloat32_t v356 = svsub_f32_x(svptrue_b32(), v351, v353); + svfloat32_t v357 = svadd_f32_x(svptrue_b32(), v352, v354); + svfloat32_t v358 = svsub_f32_x(svptrue_b32(), v352, v354); + svfloat32_t v408 = svadd_f32_x(svptrue_b32(), v404, v406); + svfloat32_t v409 = svsub_f32_x(svptrue_b32(), v404, v406); + svfloat32_t v410 = svadd_f32_x(svptrue_b32(), v405, v407); + svfloat32_t v411 = svsub_f32_x(svptrue_b32(), v405, v407); + svfloat32_t v461 = svadd_f32_x(svptrue_b32(), v457, v459); + svfloat32_t v462 = svsub_f32_x(svptrue_b32(), v457, v459); + svfloat32_t v463 = svadd_f32_x(svptrue_b32(), v458, v460); + svfloat32_t v464 = svsub_f32_x(svptrue_b32(), v458, v460); + svint16_t v478 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v467, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v486 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v466, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v492 = svadd_f32_x(svptrue_b32(), v356, 
v409); + svint16_t v497 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v356, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v519 = svadd_f32_x(svptrue_b32(), v358, v411); + svint16_t v524 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v358, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v546 = svadd_f32_x(svptrue_b32(), v357, v410); + svint16_t v551 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v357, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v573 = svadd_f32_x(svptrue_b32(), v355, v408); + svint16_t v578 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v355, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v772), v890, + svreinterpret_u64_s16(v478)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v781), v890, + svreinterpret_u64_s16(v486)); + svfloat32_t v493 = svadd_f32_x(svptrue_b32(), v492, v462); + svfloat32_t v494 = svsub_f32_x(svptrue_b32(), v492, v462); + svfloat32_t v520 = svadd_f32_x(svptrue_b32(), v519, v464); + svfloat32_t v521 = svsub_f32_x(svptrue_b32(), v519, v464); + svfloat32_t v547 = svadd_f32_x(svptrue_b32(), v546, v463); + svfloat32_t v548 = svsub_f32_x(svptrue_b32(), v546, v463); + svfloat32_t v574 = svadd_f32_x(svptrue_b32(), v573, v461); + svfloat32_t v575 = svsub_f32_x(svptrue_b32(), v573, v461); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v790), v890, + svreinterpret_u64_s16(v497)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v817), v890, + svreinterpret_u64_s16(v524)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v844), v890, + svreinterpret_u64_s16(v551)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v871), v890, + svreinterpret_u64_s16(v578)); + svint16_t v505 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v494, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v513 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v493, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v532 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v521, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v540 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v520, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v559 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v548, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v567 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v547, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 
0x0000000000040004ULL))); + svint16_t v586 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v575, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v594 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v574, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v799), v890, + svreinterpret_u64_s16(v505)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v808), v890, + svreinterpret_u64_s16(v513)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v826), v890, + svreinterpret_u64_s16(v532)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v835), v890, + svreinterpret_u64_s16(v540)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v853), v890, + svreinterpret_u64_s16(v559)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v862), v890, + svreinterpret_u64_s16(v567)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v880), v890, + svreinterpret_u64_s16(v586)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v889), v890, + svreinterpret_u64_s16(v594)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cs16_ab_t_gs16(const armral_cmplx_f32_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, + const armral_cmplx_f32_t *restrict w, + int howmany, int idist, int odist, + float dir) { + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + int32_t *v6 = (int32_t *)y; + const float32x2_t *v7 = (const float32x2_t *)w; + for (int j = 0; j < howmany; j += 1) { + float32x2_t v237 = v5[istride]; + float v571 = 1.0000000000000000e+00F; + float v572 = -1.0000000000000000e+00F; + float v579 = -7.0710678118654746e-01F; + float v586 = 7.0710678118654757e-01F; + float v589 = 9.2387953251128674e-01F; + float v590 = -9.2387953251128674e-01F; + float v597 = 5.4119610014619690e-01F; + float v604 = -1.3065629648763766e+00F; + float32x2_t v606 = (float32x2_t){v4, v4}; + float v611 = 3.8268343236508984e-01F; + float v615 = 1.3065629648763766e+00F; + float v619 = -5.4119610014619690e-01F; + float32x2_t v274 = vtrn1_f32(v237, v237); + float32x2_t v275 = vtrn2_f32(v237, v237); + float32x2_t v485 = v5[0]; + float32x2_t v573 = (float32x2_t){v571, v572}; + float32x2_t v580 = (float32x2_t){v586, v579}; + float32x2_t v587 = (float32x2_t){v586, v586}; + float32x2_t v591 = (float32x2_t){v589, v590}; + float32x2_t v598 = (float32x2_t){v619, v597}; + float32x2_t v605 = (float32x2_t){v615, v604}; + float32x2_t v612 = (float32x2_t){v611, v611}; + float32x2_t v616 = (float32x2_t){v615, v615}; + float32x2_t v620 = (float32x2_t){v619, v619}; + float32x2_t v20 = v5[istride * 8]; + int64_t v37 = 14 + j * 30; + float32x2_t v51 = v5[istride * 4]; + float32x2_t v69 = v5[istride * 12]; + int64_t v86 = 6 + j * 30; + int64_t v99 = 22 + j * 30; + float32x2_t v113 = v5[istride * 2]; + float32x2_t v131 = v5[istride * 10]; + int64_t v148 = 2 + j * 30; + int64_t v161 = 18 + j * 30; + float32x2_t v175 = v5[istride * 6]; + float32x2_t v193 = v5[istride * 14]; + int64_t v210 = 10 + j * 30; + int64_t v223 = 26 + j * 30; + float32x2_t v255 = v5[istride * 9]; + float32x2_t v273 = v7[j * 30]; + int64_t v277 = j * 30 + 1; + int64_t v285 = 16 + j * 30; + float32x2_t v299 = v5[istride * 5]; + float32x2_t v317 = v5[istride * 13]; + int64_t v334 = 8 + j * 30; + 
int64_t v347 = 24 + j * 30; + float32x2_t v361 = v5[istride * 3]; + float32x2_t v379 = v5[istride * 11]; + int64_t v396 = 4 + j * 30; + int64_t v409 = 20 + j * 30; + float32x2_t v423 = v5[istride * 7]; + float32x2_t v441 = v5[istride * 15]; + int64_t v458 = 12 + j * 30; + int64_t v471 = 28 + j * 30; + float32x2_t v575 = vmul_f32(v606, v573); + float32x2_t v582 = vmul_f32(v606, v580); + float32x2_t v593 = vmul_f32(v606, v591); + float32x2_t v600 = vmul_f32(v606, v598); + float32x2_t v607 = vmul_f32(v606, v605); + float32x2_t v38 = v7[v37]; + float32x2_t v39 = vtrn1_f32(v20, v20); + float32x2_t v40 = vtrn2_f32(v20, v20); + int64_t v42 = v37 + 1; + float32x2_t v87 = v7[v86]; + float32x2_t v88 = vtrn1_f32(v51, v51); + float32x2_t v89 = vtrn2_f32(v51, v51); + int64_t v91 = v86 + 1; + float32x2_t v100 = v7[v99]; + float32x2_t v101 = vtrn1_f32(v69, v69); + float32x2_t v102 = vtrn2_f32(v69, v69); + int64_t v104 = v99 + 1; + float32x2_t v149 = v7[v148]; + float32x2_t v150 = vtrn1_f32(v113, v113); + float32x2_t v151 = vtrn2_f32(v113, v113); + int64_t v153 = v148 + 1; + float32x2_t v162 = v7[v161]; + float32x2_t v163 = vtrn1_f32(v131, v131); + float32x2_t v164 = vtrn2_f32(v131, v131); + int64_t v166 = v161 + 1; + float32x2_t v211 = v7[v210]; + float32x2_t v212 = vtrn1_f32(v175, v175); + float32x2_t v213 = vtrn2_f32(v175, v175); + int64_t v215 = v210 + 1; + float32x2_t v224 = v7[v223]; + float32x2_t v225 = vtrn1_f32(v193, v193); + float32x2_t v226 = vtrn2_f32(v193, v193); + int64_t v228 = v223 + 1; + float32x2_t v278 = v7[v277]; + float32x2_t v279 = vmul_f32(v274, v273); + float32x2_t v286 = v7[v285]; + float32x2_t v287 = vtrn1_f32(v255, v255); + float32x2_t v288 = vtrn2_f32(v255, v255); + int64_t v290 = v285 + 1; + float32x2_t v335 = v7[v334]; + float32x2_t v336 = vtrn1_f32(v299, v299); + float32x2_t v337 = vtrn2_f32(v299, v299); + int64_t v339 = v334 + 1; + float32x2_t v348 = v7[v347]; + float32x2_t v349 = vtrn1_f32(v317, v317); + float32x2_t v350 = vtrn2_f32(v317, v317); + int64_t v352 = v347 + 1; + float32x2_t v397 = v7[v396]; + float32x2_t v398 = vtrn1_f32(v361, v361); + float32x2_t v399 = vtrn2_f32(v361, v361); + int64_t v401 = v396 + 1; + float32x2_t v410 = v7[v409]; + float32x2_t v411 = vtrn1_f32(v379, v379); + float32x2_t v412 = vtrn2_f32(v379, v379); + int64_t v414 = v409 + 1; + float32x2_t v459 = v7[v458]; + float32x2_t v460 = vtrn1_f32(v423, v423); + float32x2_t v461 = vtrn2_f32(v423, v423); + int64_t v463 = v458 + 1; + float32x2_t v472 = v7[v471]; + float32x2_t v473 = vtrn1_f32(v441, v441); + float32x2_t v474 = vtrn2_f32(v441, v441); + int64_t v476 = v471 + 1; + float32x2_t v43 = v7[v42]; + float32x2_t v44 = vmul_f32(v39, v38); + float32x2_t v92 = v7[v91]; + float32x2_t v93 = vmul_f32(v88, v87); + float32x2_t v105 = v7[v104]; + float32x2_t v106 = vmul_f32(v101, v100); + float32x2_t v154 = v7[v153]; + float32x2_t v155 = vmul_f32(v150, v149); + float32x2_t v167 = v7[v166]; + float32x2_t v168 = vmul_f32(v163, v162); + float32x2_t v216 = v7[v215]; + float32x2_t v217 = vmul_f32(v212, v211); + float32x2_t v229 = v7[v228]; + float32x2_t v230 = vmul_f32(v225, v224); + float32x2_t v291 = v7[v290]; + float32x2_t v292 = vmul_f32(v287, v286); + float32x2_t v340 = v7[v339]; + float32x2_t v341 = vmul_f32(v336, v335); + float32x2_t v353 = v7[v352]; + float32x2_t v354 = vmul_f32(v349, v348); + float32x2_t v402 = v7[v401]; + float32x2_t v403 = vmul_f32(v398, v397); + float32x2_t v415 = v7[v414]; + float32x2_t v416 = vmul_f32(v411, v410); + float32x2_t v464 = v7[v463]; + float32x2_t v465 = vmul_f32(v460, 
v459); + float32x2_t v477 = v7[v476]; + float32x2_t v478 = vmul_f32(v473, v472); + float32x2_t v281 = vfma_f32(v279, v275, v278); + float32x2_t v46 = vfma_f32(v44, v40, v43); + float32x2_t v95 = vfma_f32(v93, v89, v92); + float32x2_t v108 = vfma_f32(v106, v102, v105); + float32x2_t v157 = vfma_f32(v155, v151, v154); + float32x2_t v170 = vfma_f32(v168, v164, v167); + float32x2_t v219 = vfma_f32(v217, v213, v216); + float32x2_t v232 = vfma_f32(v230, v226, v229); + float32x2_t v294 = vfma_f32(v292, v288, v291); + float32x2_t v343 = vfma_f32(v341, v337, v340); + float32x2_t v356 = vfma_f32(v354, v350, v353); + float32x2_t v405 = vfma_f32(v403, v399, v402); + float32x2_t v418 = vfma_f32(v416, v412, v415); + float32x2_t v467 = vfma_f32(v465, v461, v464); + float32x2_t v480 = vfma_f32(v478, v474, v477); + float32x2_t v486 = vadd_f32(v485, v46); + float32x2_t v487 = vsub_f32(v485, v46); + float32x2_t v488 = vadd_f32(v95, v108); + float32x2_t v489 = vsub_f32(v95, v108); + float32x2_t v490 = vadd_f32(v157, v170); + float32x2_t v491 = vsub_f32(v157, v170); + float32x2_t v492 = vadd_f32(v219, v232); + float32x2_t v493 = vsub_f32(v219, v232); + float32x2_t v494 = vadd_f32(v281, v294); + float32x2_t v495 = vsub_f32(v281, v294); + float32x2_t v496 = vadd_f32(v343, v356); + float32x2_t v497 = vsub_f32(v343, v356); + float32x2_t v498 = vadd_f32(v405, v418); + float32x2_t v499 = vsub_f32(v405, v418); + float32x2_t v500 = vadd_f32(v467, v480); + float32x2_t v501 = vsub_f32(v467, v480); + float32x2_t v502 = vadd_f32(v486, v488); + float32x2_t v503 = vsub_f32(v486, v488); + float32x2_t v504 = vadd_f32(v490, v492); + float32x2_t v505 = vsub_f32(v490, v492); + float32x2_t v506 = vadd_f32(v494, v496); + float32x2_t v507 = vsub_f32(v494, v496); + float32x2_t v508 = vadd_f32(v498, v500); + float32x2_t v509 = vsub_f32(v498, v500); + float32x2_t v518 = vadd_f32(v491, v493); + float32x2_t v519 = vsub_f32(v491, v493); + float32x2_t v520 = vadd_f32(v495, v501); + float32x2_t v521 = vsub_f32(v495, v501); + float32x2_t v522 = vadd_f32(v497, v499); + float32x2_t v523 = vsub_f32(v497, v499); + float32x2_t v576 = vrev64_f32(v489); + float32x2_t v510 = vadd_f32(v502, v504); + float32x2_t v511 = vsub_f32(v502, v504); + float32x2_t v512 = vadd_f32(v506, v508); + float32x2_t v513 = vsub_f32(v506, v508); + float32x2_t v516 = vadd_f32(v507, v509); + float32x2_t v517 = vsub_f32(v507, v509); + float32x2_t v524 = vadd_f32(v520, v522); + float32x2_t v525 = vadd_f32(v521, v523); + float32x2_t v554 = vrev64_f32(v505); + float32x2_t v577 = vmul_f32(v576, v575); + float32x2_t v583 = vrev64_f32(v518); + float32x2_t v588 = vmul_f32(v519, v587); + float32x2_t v601 = vrev64_f32(v520); + float32x2_t v608 = vrev64_f32(v522); + float32x2_t v617 = vmul_f32(v521, v616); + float32x2_t v621 = vmul_f32(v523, v620); + float32x2_t v514 = vadd_f32(v510, v512); + float32x2_t v515 = vsub_f32(v510, v512); + float32x2_t v543 = vrev64_f32(v513); + float32x2_t v555 = vmul_f32(v554, v575); + float32x2_t v561 = vrev64_f32(v516); + float32x2_t v566 = vmul_f32(v517, v587); + float32x2_t v584 = vmul_f32(v583, v582); + float32x2_t v594 = vrev64_f32(v524); + float32x2_t v602 = vmul_f32(v601, v600); + float32x2_t v609 = vmul_f32(v608, v607); + float32x2_t v613 = vmul_f32(v525, v612); + float32x2_t v632 = vadd_f32(v487, v588); + float32x2_t v633 = vsub_f32(v487, v588); + float32x2_t v544 = vmul_f32(v543, v575); + float32x2_t v562 = vmul_f32(v561, v582); + float32x2_t v595 = vmul_f32(v594, v593); + float32x2_t v624 = vadd_f32(v503, v566); + float32x2_t v626 = 
vsub_f32(v503, v566); + float32x2_t v634 = vadd_f32(v577, v584); + float32x2_t v635 = vsub_f32(v577, v584); + float32x2_t v638 = vsub_f32(v617, v613); + float32x2_t v639 = vsub_f32(v621, v613); + float32x2_t v640 = vsub_f32(v613, v617); + float32x2_t v641 = vsub_f32(v613, v621); + int16x4_t v668 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v514, 15), (int32x2_t){0, 0})); + int16x4_t v716 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v515, 15), (int32x2_t){0, 0})); + float32x2_t v622 = vadd_f32(v511, v544); + float32x2_t v623 = vsub_f32(v511, v544); + float32x2_t v625 = vadd_f32(v555, v562); + float32x2_t v627 = vsub_f32(v562, v555); + float32x2_t v636 = vadd_f32(v595, v602); + float32x2_t v637 = vsub_f32(v595, v609); + float32x2_t v642 = vadd_f32(v632, v638); + float32x2_t v643 = vsub_f32(v632, v638); + float32x2_t v644 = vadd_f32(v632, v640); + float32x2_t v645 = vsub_f32(v632, v640); + float32x2_t v646 = vadd_f32(v633, v635); + float32x2_t v647 = vsub_f32(v633, v635); + float32x2_t v648 = vadd_f32(v633, v641); + float32x2_t v649 = vsub_f32(v633, v641); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v668), 0); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v716), 0); + float32x2_t v628 = vadd_f32(v624, v625); + float32x2_t v629 = vadd_f32(v626, v627); + float32x2_t v630 = vsub_f32(v626, v627); + float32x2_t v631 = vsub_f32(v624, v625); + float32x2_t v652 = vadd_f32(v636, v634); + float32x2_t v653 = vsub_f32(v636, v634); + float32x2_t v654 = vadd_f32(v637, v639); + float32x2_t v655 = vsub_f32(v637, v639); + float32x2_t v656 = vadd_f32(v637, v635); + float32x2_t v657 = vsub_f32(v637, v635); + int16x4_t v692 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v623, 15), (int32x2_t){0, 0})); + int16x4_t v740 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v622, 15), (int32x2_t){0, 0})); + float32x2_t v658 = vadd_f32(v642, v652); + float32x2_t v659 = vadd_f32(v643, v653); + float32x2_t v660 = vsub_f32(v644, v653); + float32x2_t v661 = vsub_f32(v645, v652); + float32x2_t v662 = vadd_f32(v646, v654); + float32x2_t v663 = vadd_f32(v647, v655); + float32x2_t v664 = vsub_f32(v648, v657); + float32x2_t v665 = vsub_f32(v649, v656); + int16x4_t v680 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v631, 15), (int32x2_t){0, 0})); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v692), 0); + int16x4_t v704 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v630, 15), (int32x2_t){0, 0})); + int16x4_t v728 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v629, 15), (int32x2_t){0, 0})); + v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v740), 0); + int16x4_t v752 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v628, 15), (int32x2_t){0, 0})); + int16x4_t v674 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v661, 15), (int32x2_t){0, 0})); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v680), 0); + int16x4_t v686 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v664, 15), (int32x2_t){0, 0})); + int16x4_t v698 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v665, 15), (int32x2_t){0, 0})); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v704), 0); + int16x4_t v710 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v660, 15), (int32x2_t){0, 0})); + int16x4_t v722 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v659, 15), (int32x2_t){0, 0})); + v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v728), 0); + int16x4_t v734 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v662, 15), (int32x2_t){0, 0})); + int16x4_t v746 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v663, 15), (int32x2_t){0, 0})); + v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v752), 
0); + int16x4_t v758 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v658, 15), (int32x2_t){0, 0})); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v674), 0); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v686), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v698), 0); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v710), 0); + v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v722), 0); + v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v734), 0); + v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v746), 0); + v6[ostride * 15] = vget_lane_s32(vreinterpret_s32_s16(v758), 0); + v5 += 1 * idist; + v6 += 1 * odist; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cs16_ab_t_gs16(const armral_cmplx_f32_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, + const armral_cmplx_f32_t *restrict w, + int howmany, int idist, int odist, + float dir) { + int64_t v0 = istride; + int64_t v1 = idist; + int64_t v2 = ostride; + int64_t v3 = odist; + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + int32_t *v6 = (int32_t *)y; + const float32x2_t *v7 = (const float32x2_t *)w; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * v1; + int64_t v12 = v10 * v3; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + int64_t v13 = j; + float v432 = -1.0000000000000000e+00F; + float v439 = -7.0710678118654746e-01F; + float v446 = 7.0710678118654757e-01F; + float v451 = -9.2387953251128674e-01F; + float v458 = 5.4119610014619690e-01F; + float v465 = -1.3065629648763766e+00F; + float v472 = 3.8268343236508984e-01F; + float v477 = 1.3065629648763766e+00F; + float v482 = -5.4119610014619690e-01F; + const float32x2_t *v727 = &v5[v0]; + int32_t *v839 = &v6[v2]; + int64_t v19 = v0 * 8; + int64_t v34 = v10 * 7; + int64_t v40 = v0 * 4; + int64_t v54 = v0 * 12; + int64_t v69 = v10 * 3; + int64_t v76 = v10 * 11; + int64_t v82 = v0 * 2; + int64_t v96 = v0 * 10; + int64_t v118 = v10 * 9; + int64_t v124 = v0 * 6; + int64_t v138 = v0 * 14; + int64_t v153 = v10 * 5; + int64_t v160 = v10 * 13; + int64_t v180 = v0 * 9; + int64_t v202 = v10 * 8; + int64_t v208 = v0 * 5; + int64_t v222 = v0 * 13; + int64_t v237 = v10 * 4; + int64_t v244 = v10 * 12; + int64_t v250 = v0 * 3; + int64_t v264 = v0 * 11; + int64_t v279 = v10 * 2; + int64_t v286 = v10 * 10; + int64_t v292 = v0 * 7; + int64_t v306 = v0 * 15; + int64_t v321 = v10 * 6; + int64_t v328 = v10 * 14; + int64_t v329 = v13 * 15; + float v435 = v4 * v432; + float v442 = v4 * v439; + float v454 = v4 * v451; + float v461 = v4 * v458; + float v468 = v4 * v465; + int64_t v547 = v2 * 2; + int64_t v555 = v2 * 3; + int64_t v563 = v2 * 4; + int64_t v571 = v2 * 5; + int64_t v579 = v2 * 6; + int64_t v587 = v2 * 7; + int64_t v595 = v2 * 8; + int64_t v603 = v2 * 9; + int64_t v611 = v2 * 10; + int64_t v619 = v2 * 11; + int64_t v627 = v2 * 12; + int64_t v635 = v2 * 13; + int64_t v643 = v2 * 14; + int64_t v651 = v2 * 15; + const float32x2_t *v802 = &v5[0]; + svint64_t v803 = svindex_s64(0, v1); + svfloat32_t v816 = svdup_n_f32(v446); + svfloat32_t v820 = svdup_n_f32(v472); + svfloat32_t v821 = svdup_n_f32(v477); + svfloat32_t v822 = svdup_n_f32(v482); + int32_t *v830 = &v6[0]; + svint64_t v966 = svindex_s64(0, v3); + int64_t v36 = v34 + v329; + int64_t v71 = v69 + v329; + int64_t v78 = v76 + v329; + int64_t v113 = v10 + v329; + int64_t v120 = v118 + v329; + int64_t v155 = v153 + v329; + int64_t v162 = v160 + v329; + svfloat32_t v198 = 
svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v329])); + int64_t v204 = v202 + v329; + int64_t v239 = v237 + v329; + int64_t v246 = v244 + v329; + int64_t v281 = v279 + v329; + int64_t v288 = v286 + v329; + int64_t v323 = v321 + v329; + int64_t v330 = v328 + v329; + const float32x2_t *v664 = &v5[v19]; + const float32x2_t *v673 = &v5[v40]; + const float32x2_t *v682 = &v5[v54]; + const float32x2_t *v691 = &v5[v82]; + const float32x2_t *v700 = &v5[v96]; + const float32x2_t *v709 = &v5[v124]; + const float32x2_t *v718 = &v5[v138]; + svfloat32_t v729 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v727), v803)); + const float32x2_t *v737 = &v5[v180]; + const float32x2_t *v747 = &v5[v208]; + const float32x2_t *v756 = &v5[v222]; + const float32x2_t *v765 = &v5[v250]; + const float32x2_t *v774 = &v5[v264]; + const float32x2_t *v783 = &v5[v292]; + const float32x2_t *v792 = &v5[v306]; + svfloat32_t v804 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v802), v803)); + svfloat32_t v814 = svdup_n_f32(v435); + svfloat32_t v815 = svdup_n_f32(v442); + svfloat32_t v817 = svdup_n_f32(v454); + svfloat32_t v818 = svdup_n_f32(v461); + svfloat32_t v819 = svdup_n_f32(v468); + int32_t *v848 = &v6[v547]; + int32_t *v857 = &v6[v555]; + int32_t *v866 = &v6[v563]; + int32_t *v875 = &v6[v571]; + int32_t *v884 = &v6[v579]; + int32_t *v893 = &v6[v587]; + int32_t *v902 = &v6[v595]; + int32_t *v911 = &v6[v603]; + int32_t *v920 = &v6[v611]; + int32_t *v929 = &v6[v619]; + int32_t *v938 = &v6[v627]; + int32_t *v947 = &v6[v635]; + int32_t *v956 = &v6[v643]; + int32_t *v965 = &v6[v651]; + svfloat32_t v37 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v36])); + svfloat32_t v72 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v71])); + svfloat32_t v79 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v78])); + svfloat32_t v114 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v113])); + svfloat32_t v121 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v120])); + svfloat32_t v156 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v155])); + svfloat32_t v163 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v162])); + svfloat32_t zero199 = svdup_n_f32(0); + svfloat32_t v199 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero199, v729, v198, 0), + v729, v198, 90); + svfloat32_t v205 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v204])); + svfloat32_t v240 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v239])); + svfloat32_t v247 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v246])); + svfloat32_t v282 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v281])); + svfloat32_t v289 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v288])); + svfloat32_t v324 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v323])); + svfloat32_t v331 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v330])); + svfloat32_t v666 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v664), v803)); + svfloat32_t v675 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v673), v803)); + svfloat32_t v684 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v682), v803)); + svfloat32_t v693 = 
svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v691), v803)); + svfloat32_t v702 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v700), v803)); + svfloat32_t v711 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v709), v803)); + svfloat32_t v720 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v718), v803)); + svfloat32_t v739 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v737), v803)); + svfloat32_t v749 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v747), v803)); + svfloat32_t v758 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v756), v803)); + svfloat32_t v767 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v765), v803)); + svfloat32_t v776 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v774), v803)); + svfloat32_t v785 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v783), v803)); + svfloat32_t v794 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v792), v803)); + svfloat32_t zero38 = svdup_n_f32(0); + svfloat32_t v38 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero38, v666, v37, 0), + v666, v37, 90); + svfloat32_t zero73 = svdup_n_f32(0); + svfloat32_t v73 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero73, v675, v72, 0), + v675, v72, 90); + svfloat32_t zero80 = svdup_n_f32(0); + svfloat32_t v80 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero80, v684, v79, 0), + v684, v79, 90); + svfloat32_t zero115 = svdup_n_f32(0); + svfloat32_t v115 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero115, v693, v114, 0), + v693, v114, 90); + svfloat32_t zero122 = svdup_n_f32(0); + svfloat32_t v122 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero122, v702, v121, 0), + v702, v121, 90); + svfloat32_t zero157 = svdup_n_f32(0); + svfloat32_t v157 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero157, v711, v156, 0), + v711, v156, 90); + svfloat32_t zero164 = svdup_n_f32(0); + svfloat32_t v164 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero164, v720, v163, 0), + v720, v163, 90); + svfloat32_t zero206 = svdup_n_f32(0); + svfloat32_t v206 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero206, v739, v205, 0), + v739, v205, 90); + svfloat32_t zero241 = svdup_n_f32(0); + svfloat32_t v241 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero241, v749, v240, 0), + v749, v240, 90); + svfloat32_t zero248 = svdup_n_f32(0); + svfloat32_t v248 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero248, v758, v247, 0), + v758, v247, 90); + svfloat32_t zero283 = svdup_n_f32(0); + svfloat32_t v283 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero283, v767, v282, 0), + v767, v282, 90); + svfloat32_t zero290 = svdup_n_f32(0); + svfloat32_t v290 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero290, v776, v289, 0), + v776, v289, 90); + svfloat32_t zero325 = svdup_n_f32(0); + svfloat32_t v325 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero325, v785, v324, 0), + v785, v324, 90); + svfloat32_t zero332 = svdup_n_f32(0); + svfloat32_t v332 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero332, v794, v331, 0), + v794, v331, 90); + svfloat32_t v340 = svadd_f32_x(svptrue_b32(), v804, v38); + svfloat32_t v341 = svsub_f32_x(svptrue_b32(), v804, v38); + svfloat32_t v342 = 
svadd_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v343 = svsub_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v344 = svadd_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v345 = svsub_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v346 = svadd_f32_x(svptrue_b32(), v157, v164); + svfloat32_t v347 = svsub_f32_x(svptrue_b32(), v157, v164); + svfloat32_t v348 = svadd_f32_x(svptrue_b32(), v199, v206); + svfloat32_t v349 = svsub_f32_x(svptrue_b32(), v199, v206); + svfloat32_t v350 = svadd_f32_x(svptrue_b32(), v241, v248); + svfloat32_t v351 = svsub_f32_x(svptrue_b32(), v241, v248); + svfloat32_t v352 = svadd_f32_x(svptrue_b32(), v283, v290); + svfloat32_t v353 = svsub_f32_x(svptrue_b32(), v283, v290); + svfloat32_t v354 = svadd_f32_x(svptrue_b32(), v325, v332); + svfloat32_t v355 = svsub_f32_x(svptrue_b32(), v325, v332); + svfloat32_t v356 = svadd_f32_x(svptrue_b32(), v340, v342); + svfloat32_t v357 = svsub_f32_x(svptrue_b32(), v340, v342); + svfloat32_t v358 = svadd_f32_x(svptrue_b32(), v344, v346); + svfloat32_t v359 = svsub_f32_x(svptrue_b32(), v344, v346); + svfloat32_t v360 = svadd_f32_x(svptrue_b32(), v348, v350); + svfloat32_t v361 = svsub_f32_x(svptrue_b32(), v348, v350); + svfloat32_t v362 = svadd_f32_x(svptrue_b32(), v352, v354); + svfloat32_t v363 = svsub_f32_x(svptrue_b32(), v352, v354); + svfloat32_t v372 = svadd_f32_x(svptrue_b32(), v345, v347); + svfloat32_t v373 = svsub_f32_x(svptrue_b32(), v345, v347); + svfloat32_t v374 = svadd_f32_x(svptrue_b32(), v349, v355); + svfloat32_t v375 = svsub_f32_x(svptrue_b32(), v349, v355); + svfloat32_t v376 = svadd_f32_x(svptrue_b32(), v351, v353); + svfloat32_t v377 = svsub_f32_x(svptrue_b32(), v351, v353); + svfloat32_t zero437 = svdup_n_f32(0); + svfloat32_t v437 = svcmla_f32_x(pred_full, zero437, v814, v343, 90); + svfloat32_t v364 = svadd_f32_x(svptrue_b32(), v356, v358); + svfloat32_t v365 = svsub_f32_x(svptrue_b32(), v356, v358); + svfloat32_t v366 = svadd_f32_x(svptrue_b32(), v360, v362); + svfloat32_t v367 = svsub_f32_x(svptrue_b32(), v360, v362); + svfloat32_t v370 = svadd_f32_x(svptrue_b32(), v361, v363); + svfloat32_t v371 = svsub_f32_x(svptrue_b32(), v361, v363); + svfloat32_t v378 = svadd_f32_x(svptrue_b32(), v374, v376); + svfloat32_t v379 = svadd_f32_x(svptrue_b32(), v375, v377); + svfloat32_t zero413 = svdup_n_f32(0); + svfloat32_t v413 = svcmla_f32_x(pred_full, zero413, v814, v359, 90); + svfloat32_t zero444 = svdup_n_f32(0); + svfloat32_t v444 = svcmla_f32_x(pred_full, zero444, v815, v372, 90); + svfloat32_t zero470 = svdup_n_f32(0); + svfloat32_t v470 = svcmla_f32_x(pred_full, zero470, v819, v376, 90); + svfloat32_t v480 = svmul_f32_x(svptrue_b32(), v375, v821); + svfloat32_t v485 = svmul_f32_x(svptrue_b32(), v377, v822); + svfloat32_t v368 = svadd_f32_x(svptrue_b32(), v364, v366); + svfloat32_t v369 = svsub_f32_x(svptrue_b32(), v364, v366); + svfloat32_t zero401 = svdup_n_f32(0); + svfloat32_t v401 = svcmla_f32_x(pred_full, zero401, v814, v367, 90); + svfloat32_t zero420 = svdup_n_f32(0); + svfloat32_t v420 = svcmla_f32_x(pred_full, zero420, v815, v370, 90); + svfloat32_t zero456 = svdup_n_f32(0); + svfloat32_t v456 = svcmla_f32_x(pred_full, zero456, v817, v378, 90); + svfloat32_t v475 = svmul_f32_x(svptrue_b32(), v379, v820); + svfloat32_t v496 = svmla_f32_x(pred_full, v341, v373, v816); + svfloat32_t v497 = svmls_f32_x(pred_full, v341, v373, v816); + svfloat32_t v498 = svadd_f32_x(svptrue_b32(), v437, v444); + svfloat32_t v499 = svsub_f32_x(svptrue_b32(), v437, v444); + svfloat32_t v486 = svadd_f32_x(svptrue_b32(), v365, 
v401); + svfloat32_t v487 = svsub_f32_x(svptrue_b32(), v365, v401); + svfloat32_t v488 = svmla_f32_x(pred_full, v357, v371, v816); + svfloat32_t v489 = svadd_f32_x(svptrue_b32(), v413, v420); + svfloat32_t v490 = svmls_f32_x(pred_full, v357, v371, v816); + svfloat32_t v491 = svsub_f32_x(svptrue_b32(), v420, v413); + svfloat32_t v500 = svcmla_f32_x(pred_full, v456, v818, v374, 90); + svfloat32_t v501 = svsub_f32_x(svptrue_b32(), v456, v470); + svfloat32_t v502 = svnmls_f32_x(pred_full, v475, v375, v821); + svfloat32_t v503 = svnmls_f32_x(pred_full, v475, v377, v822); + svfloat32_t v504 = svnmls_f32_x(pred_full, v480, v379, v820); + svfloat32_t v505 = svnmls_f32_x(pred_full, v485, v379, v820); + svfloat32_t v510 = svadd_f32_x(svptrue_b32(), v497, v499); + svfloat32_t v511 = svsub_f32_x(svptrue_b32(), v497, v499); + svint16_t v532 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v368, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v596 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v369, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v492 = svadd_f32_x(svptrue_b32(), v488, v489); + svfloat32_t v493 = svadd_f32_x(svptrue_b32(), v490, v491); + svfloat32_t v494 = svsub_f32_x(svptrue_b32(), v490, v491); + svfloat32_t v495 = svsub_f32_x(svptrue_b32(), v488, v489); + svfloat32_t v506 = svadd_f32_x(svptrue_b32(), v496, v502); + svfloat32_t v507 = svsub_f32_x(svptrue_b32(), v496, v502); + svfloat32_t v508 = svadd_f32_x(svptrue_b32(), v496, v504); + svfloat32_t v509 = svsub_f32_x(svptrue_b32(), v496, v504); + svfloat32_t v512 = svadd_f32_x(svptrue_b32(), v497, v505); + svfloat32_t v513 = svsub_f32_x(svptrue_b32(), v497, v505); + svfloat32_t v516 = svadd_f32_x(svptrue_b32(), v500, v498); + svfloat32_t v517 = svsub_f32_x(svptrue_b32(), v500, v498); + svfloat32_t v518 = svadd_f32_x(svptrue_b32(), v501, v503); + svfloat32_t v519 = svsub_f32_x(svptrue_b32(), v501, v503); + svfloat32_t v520 = svadd_f32_x(svptrue_b32(), v501, v499); + svfloat32_t v521 = svsub_f32_x(svptrue_b32(), v501, v499); + svint16_t v564 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v487, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v628 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v486, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v830), v966, + svreinterpret_u64_s16(v532)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v902), v966, + svreinterpret_u64_s16(v596)); + svfloat32_t v522 = svadd_f32_x(svptrue_b32(), v506, v516); + svfloat32_t v523 = svadd_f32_x(svptrue_b32(), v507, v517); + svfloat32_t v524 = svsub_f32_x(svptrue_b32(), v508, v517); + svfloat32_t v525 = svsub_f32_x(svptrue_b32(), v509, v516); + svfloat32_t v526 = svadd_f32_x(svptrue_b32(), v510, v518); + svfloat32_t v527 = svadd_f32_x(svptrue_b32(), v511, v519); + svfloat32_t v528 = svsub_f32_x(svptrue_b32(), v512, v521); + svfloat32_t v529 = svsub_f32_x(svptrue_b32(), v513, v520); + svint16_t v548 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v495, (float)(1ULL << 31ULL)))), + 
svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v580 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v494, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v612 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v493, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v644 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v492, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v866), v966, + svreinterpret_u64_s16(v564)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v938), v966, + svreinterpret_u64_s16(v628)); + svint16_t v540 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v525, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v556 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v528, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v572 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v529, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v588 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v524, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v604 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v523, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v620 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v526, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v636 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v527, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v652 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v522, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v848), v966, + svreinterpret_u64_s16(v548)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v884), v966, + svreinterpret_u64_s16(v580)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v920), v966, + svreinterpret_u64_s16(v612)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v956), v966, + svreinterpret_u64_s16(v644)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v839), v966, + svreinterpret_u64_s16(v540)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v857), v966, + svreinterpret_u64_s16(v556)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v875), v966, + svreinterpret_u64_s16(v572)); + svst1w_scatter_s64index_u64(pred_full, 
(unsigned *)(v893), v966, + svreinterpret_u64_s16(v588)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v911), v966, + svreinterpret_u64_s16(v604)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v929), v966, + svreinterpret_u64_s16(v620)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v947), v966, + svreinterpret_u64_s16(v636)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v965), v966, + svreinterpret_u64_s16(v652)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cs16_ab_t_gs17(const armral_cmplx_f32_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, + const armral_cmplx_f32_t *restrict w, + int howmany, int idist, int odist, + float dir) { + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + int32_t *v6 = (int32_t *)y; + const float32x2_t *v7 = (const float32x2_t *)w; + for (int j = 0; j < howmany; j += 1) { + float32x2_t v20 = v5[istride]; + float v589 = -4.2602849117736000e-02F; + float v593 = 2.0497965023262180e-01F; + float v597 = 1.0451835201736759e+00F; + float v601 = 1.7645848660222969e+00F; + float v605 = -7.2340797728605655e-01F; + float v609 = -8.9055591620606403e-02F; + float v613 = -1.0625000000000000e+00F; + float v617 = 2.5769410160110379e-01F; + float v621 = 7.7980260789483757e-01F; + float v625 = 5.4389318464570580e-01F; + float v629 = 4.2010193497052700e-01F; + float v633 = 1.2810929434228073e+00F; + float v637 = 4.4088907348175338e-01F; + float v641 = 3.1717619283272508e-01F; + float v644 = -9.0138318648016680e-01F; + float v645 = 9.0138318648016680e-01F; + float v651 = -4.3248756360072310e-01F; + float v652 = 4.3248756360072310e-01F; + float v658 = 6.6693537504044498e-01F; + float v659 = -6.6693537504044498e-01F; + float v665 = -6.0389004312516970e-01F; + float v666 = 6.0389004312516970e-01F; + float v672 = -3.6924873198582547e-01F; + float v673 = 3.6924873198582547e-01F; + float v679 = 4.8656938755549761e-01F; + float v680 = -4.8656938755549761e-01F; + float v686 = 2.3813712136760609e-01F; + float v687 = -2.3813712136760609e-01F; + float v693 = -1.5573820617422458e+00F; + float v694 = 1.5573820617422458e+00F; + float v700 = 6.5962247018731990e-01F; + float v701 = -6.5962247018731990e-01F; + float v707 = -1.4316961569866241e-01F; + float v708 = 1.4316961569866241e-01F; + float v714 = 2.3903469959860771e-01F; + float v715 = -2.3903469959860771e-01F; + float v721 = -4.7932541949972603e-02F; + float v722 = 4.7932541949972603e-02F; + float v728 = -2.3188014856550065e+00F; + float v729 = 2.3188014856550065e+00F; + float v735 = 7.8914568419206255e-01F; + float v736 = -7.8914568419206255e-01F; + float v742 = 3.8484572871179505e+00F; + float v743 = -3.8484572871179505e+00F; + float v749 = -1.3003804568801376e+00F; + float v750 = 1.3003804568801376e+00F; + float v756 = 4.0814769046889037e+00F; + float v757 = -4.0814769046889037e+00F; + float v763 = -1.4807159909286283e+00F; + float v764 = 1.4807159909286283e+00F; + float v770 = -1.3332470363551400e-02F; + float v771 = 1.3332470363551400e-02F; + float v777 = -3.7139778690557629e-01F; + float v778 = 3.7139778690557629e-01F; + float v784 = 1.9236512863456379e-01F; + float v785 = -1.9236512863456379e-01F; + float32x2_t v787 = (float32x2_t){v4, v4}; + float32x2_t v57 = vtrn1_f32(v20, v20); + float32x2_t v58 = vtrn2_f32(v20, v20); + float32x2_t v582 = v5[0]; + float32x2_t v590 = (float32x2_t){v589, v589}; + float32x2_t v594 = (float32x2_t){v593, v593}; + float32x2_t v598 = (float32x2_t){v597, v597}; + 
float32x2_t v602 = (float32x2_t){v601, v601}; + float32x2_t v606 = (float32x2_t){v605, v605}; + float32x2_t v610 = (float32x2_t){v609, v609}; + float32x2_t v614 = (float32x2_t){v613, v613}; + float32x2_t v618 = (float32x2_t){v617, v617}; + float32x2_t v622 = (float32x2_t){v621, v621}; + float32x2_t v626 = (float32x2_t){v625, v625}; + float32x2_t v630 = (float32x2_t){v629, v629}; + float32x2_t v634 = (float32x2_t){v633, v633}; + float32x2_t v638 = (float32x2_t){v637, v637}; + float32x2_t v642 = (float32x2_t){v641, v641}; + float32x2_t v646 = (float32x2_t){v644, v645}; + float32x2_t v653 = (float32x2_t){v651, v652}; + float32x2_t v660 = (float32x2_t){v658, v659}; + float32x2_t v667 = (float32x2_t){v665, v666}; + float32x2_t v674 = (float32x2_t){v672, v673}; + float32x2_t v681 = (float32x2_t){v679, v680}; + float32x2_t v688 = (float32x2_t){v686, v687}; + float32x2_t v695 = (float32x2_t){v693, v694}; + float32x2_t v702 = (float32x2_t){v700, v701}; + float32x2_t v709 = (float32x2_t){v707, v708}; + float32x2_t v716 = (float32x2_t){v714, v715}; + float32x2_t v723 = (float32x2_t){v721, v722}; + float32x2_t v730 = (float32x2_t){v728, v729}; + float32x2_t v737 = (float32x2_t){v735, v736}; + float32x2_t v744 = (float32x2_t){v742, v743}; + float32x2_t v751 = (float32x2_t){v749, v750}; + float32x2_t v758 = (float32x2_t){v756, v757}; + float32x2_t v765 = (float32x2_t){v763, v764}; + float32x2_t v772 = (float32x2_t){v770, v771}; + float32x2_t v779 = (float32x2_t){v777, v778}; + float32x2_t v786 = (float32x2_t){v784, v785}; + float32x2_t v38 = v5[istride * 16]; + float32x2_t v56 = v7[j * 32]; + int64_t v60 = j * 32 + 1; + int64_t v68 = 30 + j * 32; + float32x2_t v82 = v5[istride * 3]; + float32x2_t v100 = v5[istride * 14]; + int64_t v117 = 4 + j * 32; + int64_t v130 = 26 + j * 32; + float32x2_t v144 = v5[istride * 9]; + float32x2_t v162 = v5[istride * 8]; + int64_t v179 = 16 + j * 32; + int64_t v192 = 14 + j * 32; + float32x2_t v206 = v5[istride * 10]; + float32x2_t v224 = v5[istride * 7]; + int64_t v241 = 18 + j * 32; + int64_t v254 = 12 + j * 32; + float32x2_t v268 = v5[istride * 13]; + float32x2_t v286 = v5[istride * 4]; + int64_t v303 = 24 + j * 32; + int64_t v316 = 6 + j * 32; + float32x2_t v330 = v5[istride * 5]; + float32x2_t v348 = v5[istride * 12]; + int64_t v365 = 8 + j * 32; + int64_t v378 = 22 + j * 32; + float32x2_t v392 = v5[istride * 15]; + float32x2_t v410 = v5[istride * 2]; + int64_t v427 = 28 + j * 32; + int64_t v440 = 2 + j * 32; + float32x2_t v454 = v5[istride * 11]; + float32x2_t v472 = v5[istride * 6]; + int64_t v489 = 20 + j * 32; + int64_t v502 = 10 + j * 32; + float32x2_t v648 = vmul_f32(v787, v646); + float32x2_t v655 = vmul_f32(v787, v653); + float32x2_t v662 = vmul_f32(v787, v660); + float32x2_t v669 = vmul_f32(v787, v667); + float32x2_t v676 = vmul_f32(v787, v674); + float32x2_t v683 = vmul_f32(v787, v681); + float32x2_t v690 = vmul_f32(v787, v688); + float32x2_t v697 = vmul_f32(v787, v695); + float32x2_t v704 = vmul_f32(v787, v702); + float32x2_t v711 = vmul_f32(v787, v709); + float32x2_t v718 = vmul_f32(v787, v716); + float32x2_t v725 = vmul_f32(v787, v723); + float32x2_t v732 = vmul_f32(v787, v730); + float32x2_t v739 = vmul_f32(v787, v737); + float32x2_t v746 = vmul_f32(v787, v744); + float32x2_t v753 = vmul_f32(v787, v751); + float32x2_t v760 = vmul_f32(v787, v758); + float32x2_t v767 = vmul_f32(v787, v765); + float32x2_t v774 = vmul_f32(v787, v772); + float32x2_t v781 = vmul_f32(v787, v779); + float32x2_t v788 = vmul_f32(v787, v786); + float32x2_t v61 = v7[v60]; + 
float32x2_t v62 = vmul_f32(v57, v56); + float32x2_t v69 = v7[v68]; + float32x2_t v70 = vtrn1_f32(v38, v38); + float32x2_t v71 = vtrn2_f32(v38, v38); + int64_t v73 = v68 + 1; + float32x2_t v118 = v7[v117]; + float32x2_t v119 = vtrn1_f32(v82, v82); + float32x2_t v120 = vtrn2_f32(v82, v82); + int64_t v122 = v117 + 1; + float32x2_t v131 = v7[v130]; + float32x2_t v132 = vtrn1_f32(v100, v100); + float32x2_t v133 = vtrn2_f32(v100, v100); + int64_t v135 = v130 + 1; + float32x2_t v180 = v7[v179]; + float32x2_t v181 = vtrn1_f32(v144, v144); + float32x2_t v182 = vtrn2_f32(v144, v144); + int64_t v184 = v179 + 1; + float32x2_t v193 = v7[v192]; + float32x2_t v194 = vtrn1_f32(v162, v162); + float32x2_t v195 = vtrn2_f32(v162, v162); + int64_t v197 = v192 + 1; + float32x2_t v242 = v7[v241]; + float32x2_t v243 = vtrn1_f32(v206, v206); + float32x2_t v244 = vtrn2_f32(v206, v206); + int64_t v246 = v241 + 1; + float32x2_t v255 = v7[v254]; + float32x2_t v256 = vtrn1_f32(v224, v224); + float32x2_t v257 = vtrn2_f32(v224, v224); + int64_t v259 = v254 + 1; + float32x2_t v304 = v7[v303]; + float32x2_t v305 = vtrn1_f32(v268, v268); + float32x2_t v306 = vtrn2_f32(v268, v268); + int64_t v308 = v303 + 1; + float32x2_t v317 = v7[v316]; + float32x2_t v318 = vtrn1_f32(v286, v286); + float32x2_t v319 = vtrn2_f32(v286, v286); + int64_t v321 = v316 + 1; + float32x2_t v366 = v7[v365]; + float32x2_t v367 = vtrn1_f32(v330, v330); + float32x2_t v368 = vtrn2_f32(v330, v330); + int64_t v370 = v365 + 1; + float32x2_t v379 = v7[v378]; + float32x2_t v380 = vtrn1_f32(v348, v348); + float32x2_t v381 = vtrn2_f32(v348, v348); + int64_t v383 = v378 + 1; + float32x2_t v428 = v7[v427]; + float32x2_t v429 = vtrn1_f32(v392, v392); + float32x2_t v430 = vtrn2_f32(v392, v392); + int64_t v432 = v427 + 1; + float32x2_t v441 = v7[v440]; + float32x2_t v442 = vtrn1_f32(v410, v410); + float32x2_t v443 = vtrn2_f32(v410, v410); + int64_t v445 = v440 + 1; + float32x2_t v490 = v7[v489]; + float32x2_t v491 = vtrn1_f32(v454, v454); + float32x2_t v492 = vtrn2_f32(v454, v454); + int64_t v494 = v489 + 1; + float32x2_t v503 = v7[v502]; + float32x2_t v504 = vtrn1_f32(v472, v472); + float32x2_t v505 = vtrn2_f32(v472, v472); + int64_t v507 = v502 + 1; + float32x2_t v74 = v7[v73]; + float32x2_t v75 = vmul_f32(v70, v69); + float32x2_t v123 = v7[v122]; + float32x2_t v124 = vmul_f32(v119, v118); + float32x2_t v136 = v7[v135]; + float32x2_t v137 = vmul_f32(v132, v131); + float32x2_t v185 = v7[v184]; + float32x2_t v186 = vmul_f32(v181, v180); + float32x2_t v198 = v7[v197]; + float32x2_t v199 = vmul_f32(v194, v193); + float32x2_t v247 = v7[v246]; + float32x2_t v248 = vmul_f32(v243, v242); + float32x2_t v260 = v7[v259]; + float32x2_t v261 = vmul_f32(v256, v255); + float32x2_t v309 = v7[v308]; + float32x2_t v310 = vmul_f32(v305, v304); + float32x2_t v322 = v7[v321]; + float32x2_t v323 = vmul_f32(v318, v317); + float32x2_t v371 = v7[v370]; + float32x2_t v372 = vmul_f32(v367, v366); + float32x2_t v384 = v7[v383]; + float32x2_t v385 = vmul_f32(v380, v379); + float32x2_t v433 = v7[v432]; + float32x2_t v434 = vmul_f32(v429, v428); + float32x2_t v446 = v7[v445]; + float32x2_t v447 = vmul_f32(v442, v441); + float32x2_t v495 = v7[v494]; + float32x2_t v496 = vmul_f32(v491, v490); + float32x2_t v508 = v7[v507]; + float32x2_t v509 = vmul_f32(v504, v503); + float32x2_t v64 = vfma_f32(v62, v58, v61); + float32x2_t v77 = vfma_f32(v75, v71, v74); + float32x2_t v126 = vfma_f32(v124, v120, v123); + float32x2_t v139 = vfma_f32(v137, v133, v136); + float32x2_t v188 = vfma_f32(v186, v182, 
v185); + float32x2_t v201 = vfma_f32(v199, v195, v198); + float32x2_t v250 = vfma_f32(v248, v244, v247); + float32x2_t v263 = vfma_f32(v261, v257, v260); + float32x2_t v312 = vfma_f32(v310, v306, v309); + float32x2_t v325 = vfma_f32(v323, v319, v322); + float32x2_t v374 = vfma_f32(v372, v368, v371); + float32x2_t v387 = vfma_f32(v385, v381, v384); + float32x2_t v436 = vfma_f32(v434, v430, v433); + float32x2_t v449 = vfma_f32(v447, v443, v446); + float32x2_t v498 = vfma_f32(v496, v492, v495); + float32x2_t v511 = vfma_f32(v509, v505, v508); + float32x2_t v512 = vadd_f32(v64, v77); + float32x2_t v513 = vsub_f32(v64, v77); + float32x2_t v514 = vadd_f32(v126, v139); + float32x2_t v515 = vsub_f32(v126, v139); + float32x2_t v516 = vadd_f32(v188, v201); + float32x2_t v517 = vsub_f32(v188, v201); + float32x2_t v518 = vadd_f32(v250, v263); + float32x2_t v519 = vsub_f32(v250, v263); + float32x2_t v520 = vadd_f32(v312, v325); + float32x2_t v521 = vsub_f32(v312, v325); + float32x2_t v522 = vadd_f32(v374, v387); + float32x2_t v523 = vsub_f32(v374, v387); + float32x2_t v524 = vadd_f32(v436, v449); + float32x2_t v525 = vsub_f32(v436, v449); + float32x2_t v526 = vadd_f32(v498, v511); + float32x2_t v527 = vsub_f32(v498, v511); + float32x2_t v528 = vadd_f32(v512, v520); + float32x2_t v529 = vadd_f32(v514, v522); + float32x2_t v530 = vadd_f32(v516, v524); + float32x2_t v531 = vadd_f32(v518, v526); + float32x2_t v534 = vsub_f32(v512, v520); + float32x2_t v535 = vsub_f32(v514, v522); + float32x2_t v536 = vsub_f32(v516, v524); + float32x2_t v537 = vsub_f32(v518, v526); + float32x2_t v548 = vadd_f32(v513, v517); + float32x2_t v549 = vadd_f32(v515, v519); + float32x2_t v550 = vsub_f32(v513, v517); + float32x2_t v551 = vsub_f32(v527, v523); + float32x2_t v552 = vadd_f32(v521, v525); + float32x2_t v553 = vadd_f32(v523, v527); + float32x2_t v554 = vsub_f32(v521, v525); + float32x2_t v555 = vsub_f32(v515, v519); + float32x2_t v568 = vadd_f32(v513, v521); + float32x2_t v569 = vadd_f32(v519, v527); + float32x2_t v740 = vrev64_f32(v513); + float32x2_t v747 = vrev64_f32(v521); + float32x2_t v761 = vrev64_f32(v519); + float32x2_t v768 = vrev64_f32(v527); + float32x2_t v532 = vadd_f32(v528, v530); + float32x2_t v533 = vadd_f32(v529, v531); + float32x2_t v538 = vsub_f32(v528, v530); + float32x2_t v539 = vsub_f32(v529, v531); + float32x2_t v542 = vadd_f32(v535, v537); + float32x2_t v543 = vadd_f32(v534, v536); + float32x2_t v545 = vsub_f32(v536, v537); + float32x2_t v546 = vsub_f32(v534, v535); + float32x2_t v556 = vadd_f32(v548, v549); + float32x2_t v557 = vadd_f32(v552, v553); + float32x2_t v559 = vsub_f32(v548, v549); + float32x2_t v560 = vsub_f32(v552, v553); + float32x2_t v562 = vadd_f32(v550, v551); + float32x2_t v563 = vadd_f32(v554, v555); + float32x2_t v565 = vsub_f32(v550, v551); + float32x2_t v566 = vsub_f32(v554, v555); + float32x2_t v591 = vmul_f32(v534, v590); + float32x2_t v595 = vmul_f32(v535, v594); + float32x2_t v599 = vmul_f32(v536, v598); + float32x2_t v603 = vmul_f32(v537, v602); + float32x2_t v733 = vrev64_f32(v568); + float32x2_t v741 = vmul_f32(v740, v739); + float32x2_t v748 = vmul_f32(v747, v746); + float32x2_t v754 = vrev64_f32(v569); + float32x2_t v762 = vmul_f32(v761, v760); + float32x2_t v769 = vmul_f32(v768, v767); + float32x2_t v540 = vadd_f32(v532, v533); + float32x2_t v541 = vsub_f32(v532, v533); + float32x2_t v544 = vsub_f32(v543, v542); + float32x2_t v547 = vadd_f32(v538, v539); + float32x2_t v558 = vadd_f32(v556, v557); + float32x2_t v561 = vadd_f32(v559, v560); + float32x2_t v564 = 
vadd_f32(v562, v563); + float32x2_t v567 = vadd_f32(v565, v566); + float32x2_t v570 = vsub_f32(v563, v557); + float32x2_t v573 = vsub_f32(v556, v562); + float32x2_t v607 = vmul_f32(v538, v606); + float32x2_t v611 = vmul_f32(v539, v610); + float32x2_t v623 = vmul_f32(v542, v622); + float32x2_t v627 = vmul_f32(v543, v626); + float32x2_t v635 = vmul_f32(v545, v634); + float32x2_t v639 = vmul_f32(v546, v638); + float32x2_t v649 = vrev64_f32(v556); + float32x2_t v656 = vrev64_f32(v557); + float32x2_t v670 = vrev64_f32(v559); + float32x2_t v677 = vrev64_f32(v560); + float32x2_t v691 = vrev64_f32(v562); + float32x2_t v698 = vrev64_f32(v563); + float32x2_t v712 = vrev64_f32(v565); + float32x2_t v719 = vrev64_f32(v566); + float32x2_t v734 = vmul_f32(v733, v732); + float32x2_t v755 = vmul_f32(v754, v753); + float32x2_t v571 = vadd_f32(v570, v513); + float32x2_t v574 = vadd_f32(v573, v519); + float32x2_t v583 = vadd_f32(v582, v540); + float32x2_t v615 = vmul_f32(v540, v614); + float32x2_t v619 = vmul_f32(v541, v618); + float32x2_t v631 = vmul_f32(v544, v630); + float32x2_t v643 = vmul_f32(v547, v642); + float32x2_t v650 = vmul_f32(v649, v648); + float32x2_t v657 = vmul_f32(v656, v655); + float32x2_t v663 = vrev64_f32(v558); + float32x2_t v671 = vmul_f32(v670, v669); + float32x2_t v678 = vmul_f32(v677, v676); + float32x2_t v684 = vrev64_f32(v561); + float32x2_t v692 = vmul_f32(v691, v690); + float32x2_t v699 = vmul_f32(v698, v697); + float32x2_t v705 = vrev64_f32(v564); + float32x2_t v713 = vmul_f32(v712, v711); + float32x2_t v720 = vmul_f32(v719, v718); + float32x2_t v726 = vrev64_f32(v567); + float32x2_t v793 = vadd_f32(v603, v635); + float32x2_t v794 = vsub_f32(v635, v599); + float32x2_t v795 = vadd_f32(v595, v639); + float32x2_t v796 = vsub_f32(v591, v639); + float32x2_t v572 = vsub_f32(v571, v569); + float32x2_t v575 = vadd_f32(v574, v521); + float32x2_t v664 = vmul_f32(v663, v662); + float32x2_t v685 = vmul_f32(v684, v683); + float32x2_t v706 = vmul_f32(v705, v704); + float32x2_t v727 = vmul_f32(v726, v725); + float32x2_t v791 = vadd_f32(v623, v631); + float32x2_t v792 = vsub_f32(v627, v631); + float32x2_t v797 = vsub_f32(v643, v611); + float32x2_t v798 = vadd_f32(v643, v607); + float32x2_t v799 = vadd_f32(v615, v583); + int16x4_t v867 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v583, 15), (int32x2_t){0, 0})); + float32x2_t v576 = vsub_f32(v575, v527); + float32x2_t v775 = vrev64_f32(v572); + float32x2_t v800 = vadd_f32(v619, v799); + float32x2_t v801 = vsub_f32(v799, v619); + float32x2_t v802 = vsub_f32(v791, v793); + float32x2_t v804 = vadd_f32(v792, v794); + float32x2_t v806 = vadd_f32(v791, v795); + float32x2_t v808 = vadd_f32(v792, v796); + float32x2_t v818 = vadd_f32(v650, v664); + float32x2_t v819 = vadd_f32(v657, v664); + float32x2_t v820 = vadd_f32(v671, v685); + float32x2_t v821 = vadd_f32(v678, v685); + float32x2_t v822 = vadd_f32(v692, v706); + float32x2_t v823 = vadd_f32(v699, v706); + float32x2_t v824 = vadd_f32(v713, v727); + float32x2_t v825 = vadd_f32(v720, v727); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v867), 0); + float32x2_t v577 = vadd_f32(v572, v576); + float32x2_t v776 = vmul_f32(v775, v774); + float32x2_t v782 = vrev64_f32(v576); + float32x2_t v803 = vadd_f32(v797, v800); + float32x2_t v805 = vadd_f32(v798, v801); + float32x2_t v807 = vsub_f32(v800, v797); + float32x2_t v809 = vsub_f32(v801, v798); + float32x2_t v829 = vadd_f32(v818, v820); + float32x2_t v830 = vsub_f32(v818, v820); + float32x2_t v831 = vadd_f32(v819, v821); + float32x2_t v832 = vsub_f32(v819, v821); 
+ float32x2_t v833 = vadd_f32(v822, v824); + float32x2_t v834 = vsub_f32(v824, v822); + float32x2_t v835 = vadd_f32(v823, v825); + float32x2_t v836 = vsub_f32(v825, v823); + float32x2_t v783 = vmul_f32(v782, v781); + float32x2_t v789 = vrev64_f32(v577); + float32x2_t v810 = vadd_f32(v802, v803); + float32x2_t v811 = vadd_f32(v804, v805); + float32x2_t v812 = vadd_f32(v806, v807); + float32x2_t v813 = vadd_f32(v808, v809); + float32x2_t v814 = vsub_f32(v803, v802); + float32x2_t v815 = vsub_f32(v805, v804); + float32x2_t v816 = vsub_f32(v807, v806); + float32x2_t v817 = vsub_f32(v809, v808); + float32x2_t v846 = vadd_f32(v831, v835); + float32x2_t v848 = vadd_f32(v830, v836); + float32x2_t v850 = vsub_f32(v829, v833); + float32x2_t v852 = vsub_f32(v836, v830); + float32x2_t v854 = vadd_f32(v829, v833); + float32x2_t v857 = vsub_f32(v834, v832); + float32x2_t v860 = vsub_f32(v835, v831); + float32x2_t v863 = vadd_f32(v832, v834); + float32x2_t v790 = vmul_f32(v789, v788); + float32x2_t v837 = vsub_f32(v776, v783); + float32x2_t v826 = vadd_f32(v790, v783); + float32x2_t v839 = vadd_f32(v837, v837); + float32x2_t v864 = vsub_f32(v863, v837); + float32x2_t v827 = vadd_f32(v734, v826); + float32x2_t v840 = vsub_f32(v755, v839); + float32x2_t v843 = vadd_f32(v826, v826); + float32x2_t v861 = vadd_f32(v860, v839); + float32x2_t v899 = vadd_f32(v817, v864); + float32x2_t v906 = vsub_f32(v817, v864); + float32x2_t v828 = vadd_f32(v827, v741); + float32x2_t v838 = vadd_f32(v827, v748); + float32x2_t v841 = vadd_f32(v840, v762); + float32x2_t v842 = vadd_f32(v840, v769); + float32x2_t v844 = vadd_f32(v843, v843); + float32x2_t v845 = vadd_f32(v837, v843); + float32x2_t v851 = vadd_f32(v850, v843); + float32x2_t v862 = vadd_f32(v861, v843); + int16x4_t v902 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v899, 15), (int32x2_t){0, 0})); + int16x4_t v909 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v906, 15), (int32x2_t){0, 0})); + float32x2_t v847 = vadd_f32(v846, v838); + float32x2_t v849 = vadd_f32(v848, v841); + float32x2_t v853 = vsub_f32(v852, v845); + float32x2_t v855 = vadd_f32(v854, v828); + float32x2_t v858 = vsub_f32(v857, v842); + float32x2_t v885 = vadd_f32(v812, v851); + float32x2_t v892 = vsub_f32(v812, v851); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v902), 0); + v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v909), 0); + float32x2_t v969 = vadd_f32(v816, v862); + float32x2_t v976 = vsub_f32(v816, v862); + float32x2_t v856 = vadd_f32(v855, v837); + float32x2_t v859 = vadd_f32(v858, v844); + float32x2_t v871 = vadd_f32(v810, v847); + float32x2_t v878 = vsub_f32(v810, v847); + int16x4_t v888 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v885, 15), (int32x2_t){0, 0})); + int16x4_t v895 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v892, 15), (int32x2_t){0, 0})); + float32x2_t v927 = vadd_f32(v813, v853); + float32x2_t v934 = vsub_f32(v813, v853); + float32x2_t v941 = vadd_f32(v811, v849); + float32x2_t v948 = vsub_f32(v811, v849); + int16x4_t v972 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v969, 15), (int32x2_t){0, 0})); + int16x4_t v979 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v976, 15), (int32x2_t){0, 0})); + int16x4_t v874 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v871, 15), (int32x2_t){0, 0})); + int16x4_t v881 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v878, 15), (int32x2_t){0, 0})); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v888), 0); + v6[ostride * 15] = vget_lane_s32(vreinterpret_s32_s16(v895), 0); + float32x2_t v913 = vadd_f32(v814, v856); + float32x2_t v920 
= vsub_f32(v814, v856); + int16x4_t v930 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v927, 15), (int32x2_t){0, 0})); + int16x4_t v937 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v934, 15), (int32x2_t){0, 0})); + int16x4_t v944 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v941, 15), (int32x2_t){0, 0})); + int16x4_t v951 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v948, 15), (int32x2_t){0, 0})); + float32x2_t v955 = vadd_f32(v815, v859); + float32x2_t v962 = vsub_f32(v815, v859); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v972), 0); + v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v979), 0); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v874), 0); + v6[ostride * 16] = vget_lane_s32(vreinterpret_s32_s16(v881), 0); + int16x4_t v916 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v913, 15), (int32x2_t){0, 0})); + int16x4_t v923 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v920, 15), (int32x2_t){0, 0})); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v930), 0); + v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v937), 0); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v944), 0); + v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v951), 0); + int16x4_t v958 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v955, 15), (int32x2_t){0, 0})); + int16x4_t v965 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v962, 15), (int32x2_t){0, 0})); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v916), 0); + v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v923), 0); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v958), 0); + v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v965), 0); + v5 += 1 * idist; + v6 += 1 * odist; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cs16_ab_t_gs17(const armral_cmplx_f32_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, + const armral_cmplx_f32_t *restrict w, + int howmany, int idist, int odist, + float dir) { + int64_t v0 = istride; + int64_t v1 = idist; + int64_t v2 = ostride; + int64_t v3 = odist; + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + int32_t *v6 = (int32_t *)y; + const float32x2_t *v7 = (const float32x2_t *)w; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * v1; + int64_t v12 = v10 * v3; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + int64_t v13 = j; + float v434 = -4.2602849117736000e-02F; + float v439 = 2.0497965023262180e-01F; + float v444 = 1.0451835201736759e+00F; + float v449 = 1.7645848660222969e+00F; + float v454 = -7.2340797728605655e-01F; + float v459 = -8.9055591620606403e-02F; + float v464 = -1.0625000000000000e+00F; + float v469 = 2.5769410160110379e-01F; + float v474 = 7.7980260789483757e-01F; + float v479 = 5.4389318464570580e-01F; + float v484 = 4.2010193497052700e-01F; + float v489 = 1.2810929434228073e+00F; + float v494 = 4.4088907348175338e-01F; + float v499 = 3.1717619283272508e-01F; + float v504 = 9.0138318648016680e-01F; + float v511 = 4.3248756360072310e-01F; + float v518 = -6.6693537504044498e-01F; + float v525 = 6.0389004312516970e-01F; + float v532 = 3.6924873198582547e-01F; + float v539 = -4.8656938755549761e-01F; + float v546 = -2.3813712136760609e-01F; + float v553 = 1.5573820617422458e+00F; + float v560 = -6.5962247018731990e-01F; + float v567 = 1.4316961569866241e-01F; + float v574 = -2.3903469959860771e-01F; + float v581 = 4.7932541949972603e-02F; + float v588 = 2.3188014856550065e+00F; + float v595 = -7.8914568419206255e-01F; + float v602 
= -3.8484572871179505e+00F; + float v609 = 1.3003804568801376e+00F; + float v616 = -4.0814769046889037e+00F; + float v623 = 1.4807159909286283e+00F; + float v630 = 1.3332470363551400e-02F; + float v637 = 3.7139778690557629e-01F; + float v644 = -1.9236512863456379e-01F; + const float32x2_t *v882 = &v5[v0]; + int32_t *v1084 = &v6[v2]; + int64_t v33 = v0 * 16; + int64_t v55 = v10 * 15; + int64_t v61 = v0 * 3; + int64_t v75 = v0 * 14; + int64_t v90 = v10 * 2; + int64_t v97 = v10 * 13; + int64_t v103 = v0 * 9; + int64_t v117 = v0 * 8; + int64_t v132 = v10 * 8; + int64_t v139 = v10 * 7; + int64_t v145 = v0 * 10; + int64_t v159 = v0 * 7; + int64_t v174 = v10 * 9; + int64_t v181 = v10 * 6; + int64_t v187 = v0 * 13; + int64_t v201 = v0 * 4; + int64_t v216 = v10 * 12; + int64_t v223 = v10 * 3; + int64_t v229 = v0 * 5; + int64_t v243 = v0 * 12; + int64_t v258 = v10 * 4; + int64_t v265 = v10 * 11; + int64_t v271 = v0 * 15; + int64_t v285 = v0 * 2; + int64_t v300 = v10 * 14; + int64_t v313 = v0 * 11; + int64_t v327 = v0 * 6; + int64_t v342 = v10 * 10; + int64_t v349 = v10 * 5; + int64_t v350 = v13 * 16; + float v507 = v4 * v504; + float v514 = v4 * v511; + float v521 = v4 * v518; + float v528 = v4 * v525; + float v535 = v4 * v532; + float v542 = v4 * v539; + float v549 = v4 * v546; + float v556 = v4 * v553; + float v563 = v4 * v560; + float v570 = v4 * v567; + float v577 = v4 * v574; + float v584 = v4 * v581; + float v591 = v4 * v588; + float v598 = v4 * v595; + float v605 = v4 * v602; + float v612 = v4 * v609; + float v619 = v4 * v616; + float v626 = v4 * v623; + float v633 = v4 * v630; + float v640 = v4 * v637; + float v647 = v4 * v644; + int64_t v743 = v2 * 16; + int64_t v752 = v2 * 2; + int64_t v761 = v2 * 15; + int64_t v770 = v2 * 3; + int64_t v779 = v2 * 14; + int64_t v788 = v2 * 4; + int64_t v797 = v2 * 13; + int64_t v806 = v2 * 5; + int64_t v815 = v2 * 12; + int64_t v824 = v2 * 6; + int64_t v833 = v2 * 11; + int64_t v842 = v2 * 7; + int64_t v851 = v2 * 10; + int64_t v860 = v2 * 8; + int64_t v869 = v2 * 9; + const float32x2_t *v1029 = &v5[0]; + svint64_t v1030 = svindex_s64(0, v1); + svfloat32_t v1033 = svdup_n_f32(v434); + svfloat32_t v1034 = svdup_n_f32(v439); + svfloat32_t v1035 = svdup_n_f32(v444); + svfloat32_t v1036 = svdup_n_f32(v449); + svfloat32_t v1037 = svdup_n_f32(v454); + svfloat32_t v1038 = svdup_n_f32(v459); + svfloat32_t v1039 = svdup_n_f32(v464); + svfloat32_t v1040 = svdup_n_f32(v469); + svfloat32_t v1041 = svdup_n_f32(v474); + svfloat32_t v1042 = svdup_n_f32(v479); + svfloat32_t v1043 = svdup_n_f32(v484); + svfloat32_t v1044 = svdup_n_f32(v489); + svfloat32_t v1045 = svdup_n_f32(v494); + svfloat32_t v1046 = svdup_n_f32(v499); + int32_t *v1075 = &v6[0]; + svint64_t v1220 = svindex_s64(0, v3); + svfloat32_t v51 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v350])); + int64_t v57 = v55 + v350; + int64_t v92 = v90 + v350; + int64_t v99 = v97 + v350; + int64_t v134 = v132 + v350; + int64_t v141 = v139 + v350; + int64_t v176 = v174 + v350; + int64_t v183 = v181 + v350; + int64_t v218 = v216 + v350; + int64_t v225 = v223 + v350; + int64_t v260 = v258 + v350; + int64_t v267 = v265 + v350; + int64_t v302 = v300 + v350; + int64_t v309 = v10 + v350; + int64_t v344 = v342 + v350; + int64_t v351 = v349 + v350; + svfloat32_t v884 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v882), v1030)); + const float32x2_t *v892 = &v5[v33]; + const float32x2_t *v902 = &v5[v61]; + const float32x2_t *v911 = &v5[v75]; + const float32x2_t *v920 = 
&v5[v103]; + const float32x2_t *v929 = &v5[v117]; + const float32x2_t *v938 = &v5[v145]; + const float32x2_t *v947 = &v5[v159]; + const float32x2_t *v956 = &v5[v187]; + const float32x2_t *v965 = &v5[v201]; + const float32x2_t *v974 = &v5[v229]; + const float32x2_t *v983 = &v5[v243]; + const float32x2_t *v992 = &v5[v271]; + const float32x2_t *v1001 = &v5[v285]; + const float32x2_t *v1010 = &v5[v313]; + const float32x2_t *v1019 = &v5[v327]; + svfloat32_t v1031 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1029), v1030)); + svfloat32_t v1047 = svdup_n_f32(v507); + svfloat32_t v1048 = svdup_n_f32(v514); + svfloat32_t v1049 = svdup_n_f32(v521); + svfloat32_t v1050 = svdup_n_f32(v528); + svfloat32_t v1051 = svdup_n_f32(v535); + svfloat32_t v1052 = svdup_n_f32(v542); + svfloat32_t v1053 = svdup_n_f32(v549); + svfloat32_t v1054 = svdup_n_f32(v556); + svfloat32_t v1055 = svdup_n_f32(v563); + svfloat32_t v1056 = svdup_n_f32(v570); + svfloat32_t v1057 = svdup_n_f32(v577); + svfloat32_t v1058 = svdup_n_f32(v584); + svfloat32_t v1059 = svdup_n_f32(v591); + svfloat32_t v1060 = svdup_n_f32(v598); + svfloat32_t v1061 = svdup_n_f32(v605); + svfloat32_t v1062 = svdup_n_f32(v612); + svfloat32_t v1063 = svdup_n_f32(v619); + svfloat32_t v1064 = svdup_n_f32(v626); + svfloat32_t v1065 = svdup_n_f32(v633); + svfloat32_t v1066 = svdup_n_f32(v640); + svfloat32_t v1067 = svdup_n_f32(v647); + int32_t *v1093 = &v6[v743]; + int32_t *v1102 = &v6[v752]; + int32_t *v1111 = &v6[v761]; + int32_t *v1120 = &v6[v770]; + int32_t *v1129 = &v6[v779]; + int32_t *v1138 = &v6[v788]; + int32_t *v1147 = &v6[v797]; + int32_t *v1156 = &v6[v806]; + int32_t *v1165 = &v6[v815]; + int32_t *v1174 = &v6[v824]; + int32_t *v1183 = &v6[v833]; + int32_t *v1192 = &v6[v842]; + int32_t *v1201 = &v6[v851]; + int32_t *v1210 = &v6[v860]; + int32_t *v1219 = &v6[v869]; + svfloat32_t zero52 = svdup_n_f32(0); + svfloat32_t v52 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero52, v884, v51, 0), + v884, v51, 90); + svfloat32_t v58 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v57])); + svfloat32_t v93 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v92])); + svfloat32_t v100 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v99])); + svfloat32_t v135 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v134])); + svfloat32_t v142 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v141])); + svfloat32_t v177 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v176])); + svfloat32_t v184 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v183])); + svfloat32_t v219 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v218])); + svfloat32_t v226 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v225])); + svfloat32_t v261 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v260])); + svfloat32_t v268 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v267])); + svfloat32_t v303 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v302])); + svfloat32_t v310 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v309])); + svfloat32_t v345 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v344])); + svfloat32_t v352 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v351])); + svfloat32_t v894 = svreinterpret_f32_f64( + 
svld1_gather_s64index_f64(pred_full, (const double *)(v892), v1030)); + svfloat32_t v904 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v902), v1030)); + svfloat32_t v913 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v911), v1030)); + svfloat32_t v922 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v920), v1030)); + svfloat32_t v931 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v929), v1030)); + svfloat32_t v940 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v938), v1030)); + svfloat32_t v949 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v947), v1030)); + svfloat32_t v958 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v956), v1030)); + svfloat32_t v967 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v965), v1030)); + svfloat32_t v976 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v974), v1030)); + svfloat32_t v985 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v983), v1030)); + svfloat32_t v994 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v992), v1030)); + svfloat32_t v1003 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1001), v1030)); + svfloat32_t v1012 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1010), v1030)); + svfloat32_t v1021 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1019), v1030)); + svfloat32_t zero59 = svdup_n_f32(0); + svfloat32_t v59 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero59, v894, v58, 0), + v894, v58, 90); + svfloat32_t zero94 = svdup_n_f32(0); + svfloat32_t v94 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero94, v904, v93, 0), + v904, v93, 90); + svfloat32_t zero101 = svdup_n_f32(0); + svfloat32_t v101 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero101, v913, v100, 0), + v913, v100, 90); + svfloat32_t zero136 = svdup_n_f32(0); + svfloat32_t v136 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero136, v922, v135, 0), + v922, v135, 90); + svfloat32_t zero143 = svdup_n_f32(0); + svfloat32_t v143 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero143, v931, v142, 0), + v931, v142, 90); + svfloat32_t zero178 = svdup_n_f32(0); + svfloat32_t v178 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero178, v940, v177, 0), + v940, v177, 90); + svfloat32_t zero185 = svdup_n_f32(0); + svfloat32_t v185 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero185, v949, v184, 0), + v949, v184, 90); + svfloat32_t zero220 = svdup_n_f32(0); + svfloat32_t v220 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero220, v958, v219, 0), + v958, v219, 90); + svfloat32_t zero227 = svdup_n_f32(0); + svfloat32_t v227 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero227, v967, v226, 0), + v967, v226, 90); + svfloat32_t zero262 = svdup_n_f32(0); + svfloat32_t v262 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero262, v976, v261, 0), + v976, v261, 90); + svfloat32_t zero269 = svdup_n_f32(0); + svfloat32_t v269 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero269, v985, v268, 0), + v985, v268, 90); + svfloat32_t zero304 = svdup_n_f32(0); + svfloat32_t v304 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero304, v994, v303, 0), + v994, v303, 90); + 
svfloat32_t zero311 = svdup_n_f32(0); + svfloat32_t v311 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero311, v1003, v310, 0), v1003, + v310, 90); + svfloat32_t zero346 = svdup_n_f32(0); + svfloat32_t v346 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero346, v1012, v345, 0), v1012, + v345, 90); + svfloat32_t zero353 = svdup_n_f32(0); + svfloat32_t v353 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero353, v1021, v352, 0), v1021, + v352, 90); + svfloat32_t v354 = svadd_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v355 = svsub_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v356 = svadd_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v357 = svsub_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v358 = svadd_f32_x(svptrue_b32(), v136, v143); + svfloat32_t v359 = svsub_f32_x(svptrue_b32(), v136, v143); + svfloat32_t v360 = svadd_f32_x(svptrue_b32(), v178, v185); + svfloat32_t v361 = svsub_f32_x(svptrue_b32(), v178, v185); + svfloat32_t v362 = svadd_f32_x(svptrue_b32(), v220, v227); + svfloat32_t v363 = svsub_f32_x(svptrue_b32(), v220, v227); + svfloat32_t v364 = svadd_f32_x(svptrue_b32(), v262, v269); + svfloat32_t v365 = svsub_f32_x(svptrue_b32(), v262, v269); + svfloat32_t v366 = svadd_f32_x(svptrue_b32(), v304, v311); + svfloat32_t v367 = svsub_f32_x(svptrue_b32(), v304, v311); + svfloat32_t v368 = svadd_f32_x(svptrue_b32(), v346, v353); + svfloat32_t v369 = svsub_f32_x(svptrue_b32(), v346, v353); + svfloat32_t v370 = svadd_f32_x(svptrue_b32(), v354, v362); + svfloat32_t v371 = svadd_f32_x(svptrue_b32(), v356, v364); + svfloat32_t v372 = svadd_f32_x(svptrue_b32(), v358, v366); + svfloat32_t v373 = svadd_f32_x(svptrue_b32(), v360, v368); + svfloat32_t v376 = svsub_f32_x(svptrue_b32(), v354, v362); + svfloat32_t v377 = svsub_f32_x(svptrue_b32(), v356, v364); + svfloat32_t v378 = svsub_f32_x(svptrue_b32(), v358, v366); + svfloat32_t v379 = svsub_f32_x(svptrue_b32(), v360, v368); + svfloat32_t v390 = svadd_f32_x(svptrue_b32(), v355, v359); + svfloat32_t v391 = svadd_f32_x(svptrue_b32(), v357, v361); + svfloat32_t v392 = svsub_f32_x(svptrue_b32(), v355, v359); + svfloat32_t v393 = svsub_f32_x(svptrue_b32(), v369, v365); + svfloat32_t v394 = svadd_f32_x(svptrue_b32(), v363, v367); + svfloat32_t v395 = svadd_f32_x(svptrue_b32(), v365, v369); + svfloat32_t v396 = svsub_f32_x(svptrue_b32(), v363, v367); + svfloat32_t v397 = svsub_f32_x(svptrue_b32(), v357, v361); + svfloat32_t v410 = svadd_f32_x(svptrue_b32(), v355, v363); + svfloat32_t v411 = svadd_f32_x(svptrue_b32(), v361, v369); + svfloat32_t v374 = svadd_f32_x(svptrue_b32(), v370, v372); + svfloat32_t v375 = svadd_f32_x(svptrue_b32(), v371, v373); + svfloat32_t v380 = svsub_f32_x(svptrue_b32(), v370, v372); + svfloat32_t v381 = svsub_f32_x(svptrue_b32(), v371, v373); + svfloat32_t v384 = svadd_f32_x(svptrue_b32(), v377, v379); + svfloat32_t v385 = svadd_f32_x(svptrue_b32(), v376, v378); + svfloat32_t v387 = svsub_f32_x(svptrue_b32(), v378, v379); + svfloat32_t v388 = svsub_f32_x(svptrue_b32(), v376, v377); + svfloat32_t v398 = svadd_f32_x(svptrue_b32(), v390, v391); + svfloat32_t v399 = svadd_f32_x(svptrue_b32(), v394, v395); + svfloat32_t v401 = svsub_f32_x(svptrue_b32(), v390, v391); + svfloat32_t v402 = svsub_f32_x(svptrue_b32(), v394, v395); + svfloat32_t v404 = svadd_f32_x(svptrue_b32(), v392, v393); + svfloat32_t v405 = svadd_f32_x(svptrue_b32(), v396, v397); + svfloat32_t v407 = svsub_f32_x(svptrue_b32(), v392, v393); + svfloat32_t v408 = svsub_f32_x(svptrue_b32(), v396, v397); + svfloat32_t v447 = 
svmul_f32_x(svptrue_b32(), v378, v1035); + svfloat32_t zero614 = svdup_n_f32(0); + svfloat32_t v614 = svcmla_f32_x(pred_full, zero614, v1062, v411, 90); + svfloat32_t v382 = svadd_f32_x(svptrue_b32(), v374, v375); + svfloat32_t v383 = svsub_f32_x(svptrue_b32(), v374, v375); + svfloat32_t v386 = svsub_f32_x(svptrue_b32(), v385, v384); + svfloat32_t v389 = svadd_f32_x(svptrue_b32(), v380, v381); + svfloat32_t v400 = svadd_f32_x(svptrue_b32(), v398, v399); + svfloat32_t v403 = svadd_f32_x(svptrue_b32(), v401, v402); + svfloat32_t v406 = svadd_f32_x(svptrue_b32(), v404, v405); + svfloat32_t v409 = svadd_f32_x(svptrue_b32(), v407, v408); + svfloat32_t v412 = svsub_f32_x(svptrue_b32(), v405, v399); + svfloat32_t v415 = svsub_f32_x(svptrue_b32(), v398, v404); + svfloat32_t v457 = svmul_f32_x(svptrue_b32(), v380, v1037); + svfloat32_t v462 = svmul_f32_x(svptrue_b32(), v381, v1038); + svfloat32_t v492 = svmul_f32_x(svptrue_b32(), v387, v1044); + svfloat32_t v497 = svmul_f32_x(svptrue_b32(), v388, v1045); + svfloat32_t v413 = svadd_f32_x(svptrue_b32(), v412, v355); + svfloat32_t v416 = svadd_f32_x(svptrue_b32(), v415, v361); + svfloat32_t v427 = svadd_f32_x(svptrue_b32(), v1031, v382); + svfloat32_t v487 = svmul_f32_x(svptrue_b32(), v386, v1043); + svfloat32_t zero523 = svdup_n_f32(0); + svfloat32_t v523 = svcmla_f32_x(pred_full, zero523, v1049, v400, 90); + svfloat32_t zero544 = svdup_n_f32(0); + svfloat32_t v544 = svcmla_f32_x(pred_full, zero544, v1052, v403, 90); + svfloat32_t zero565 = svdup_n_f32(0); + svfloat32_t v565 = svcmla_f32_x(pred_full, zero565, v1055, v406, 90); + svfloat32_t zero586 = svdup_n_f32(0); + svfloat32_t v586 = svcmla_f32_x(pred_full, zero586, v1058, v409, 90); + svfloat32_t v652 = svmla_f32_x(pred_full, v492, v379, v1036); + svfloat32_t v653 = svnmls_f32_x(pred_full, v447, v387, v1044); + svfloat32_t v654 = svmla_f32_x(pred_full, v497, v377, v1034); + svfloat32_t v655 = svnmls_f32_x(pred_full, v497, v376, v1033); + svfloat32_t v414 = svsub_f32_x(svptrue_b32(), v413, v411); + svfloat32_t v417 = svadd_f32_x(svptrue_b32(), v416, v363); + svfloat32_t v650 = svmla_f32_x(pred_full, v487, v384, v1041); + svfloat32_t v651 = svnmls_f32_x(pred_full, v487, v385, v1042); + svfloat32_t v656 = svnmls_f32_x(pred_full, v462, v389, v1046); + svfloat32_t v657 = svmla_f32_x(pred_full, v457, v389, v1046); + svfloat32_t v658 = svmla_f32_x(pred_full, v427, v382, v1039); + svfloat32_t v677 = svcmla_f32_x(pred_full, v523, v1047, v398, 90); + svfloat32_t v678 = svcmla_f32_x(pred_full, v523, v1048, v399, 90); + svfloat32_t v679 = svcmla_f32_x(pred_full, v544, v1050, v401, 90); + svfloat32_t v680 = svcmla_f32_x(pred_full, v544, v1051, v402, 90); + svfloat32_t v681 = svcmla_f32_x(pred_full, v565, v1053, v404, 90); + svfloat32_t v682 = svcmla_f32_x(pred_full, v565, v1054, v405, 90); + svfloat32_t v683 = svcmla_f32_x(pred_full, v586, v1056, v407, 90); + svfloat32_t v684 = svcmla_f32_x(pred_full, v586, v1057, v408, 90); + svint16_t v726 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v427, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v418 = svsub_f32_x(svptrue_b32(), v417, v369); + svfloat32_t zero635 = svdup_n_f32(0); + svfloat32_t v635 = svcmla_f32_x(pred_full, zero635, v1065, v414, 90); + svfloat32_t v659 = svmla_f32_x(pred_full, v658, v383, v1040); + svfloat32_t v660 = svmls_f32_x(pred_full, v658, v383, v1040); + svfloat32_t v661 = svsub_f32_x(svptrue_b32(), v650, v652); + 
svfloat32_t v663 = svadd_f32_x(svptrue_b32(), v651, v653); + svfloat32_t v665 = svadd_f32_x(svptrue_b32(), v650, v654); + svfloat32_t v667 = svadd_f32_x(svptrue_b32(), v651, v655); + svfloat32_t v688 = svadd_f32_x(svptrue_b32(), v677, v679); + svfloat32_t v689 = svsub_f32_x(svptrue_b32(), v677, v679); + svfloat32_t v690 = svadd_f32_x(svptrue_b32(), v678, v680); + svfloat32_t v691 = svsub_f32_x(svptrue_b32(), v678, v680); + svfloat32_t v692 = svadd_f32_x(svptrue_b32(), v681, v683); + svfloat32_t v693 = svsub_f32_x(svptrue_b32(), v683, v681); + svfloat32_t v694 = svadd_f32_x(svptrue_b32(), v682, v684); + svfloat32_t v695 = svsub_f32_x(svptrue_b32(), v684, v682); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1075), v1220, + svreinterpret_u64_s16(v726)); + svfloat32_t v419 = svadd_f32_x(svptrue_b32(), v414, v418); + svfloat32_t zero642 = svdup_n_f32(0); + svfloat32_t v642 = svcmla_f32_x(pred_full, zero642, v1066, v418, 90); + svfloat32_t v662 = svadd_f32_x(svptrue_b32(), v656, v659); + svfloat32_t v664 = svadd_f32_x(svptrue_b32(), v657, v660); + svfloat32_t v666 = svsub_f32_x(svptrue_b32(), v659, v656); + svfloat32_t v668 = svsub_f32_x(svptrue_b32(), v660, v657); + svfloat32_t v705 = svadd_f32_x(svptrue_b32(), v690, v694); + svfloat32_t v707 = svadd_f32_x(svptrue_b32(), v689, v695); + svfloat32_t v709 = svsub_f32_x(svptrue_b32(), v688, v692); + svfloat32_t v711 = svsub_f32_x(svptrue_b32(), v695, v689); + svfloat32_t v713 = svadd_f32_x(svptrue_b32(), v688, v692); + svfloat32_t v716 = svsub_f32_x(svptrue_b32(), v693, v691); + svfloat32_t v719 = svsub_f32_x(svptrue_b32(), v694, v690); + svfloat32_t v722 = svadd_f32_x(svptrue_b32(), v691, v693); + svfloat32_t v669 = svadd_f32_x(svptrue_b32(), v661, v662); + svfloat32_t v670 = svadd_f32_x(svptrue_b32(), v663, v664); + svfloat32_t v671 = svadd_f32_x(svptrue_b32(), v665, v666); + svfloat32_t v672 = svadd_f32_x(svptrue_b32(), v667, v668); + svfloat32_t v673 = svsub_f32_x(svptrue_b32(), v662, v661); + svfloat32_t v674 = svsub_f32_x(svptrue_b32(), v664, v663); + svfloat32_t v675 = svsub_f32_x(svptrue_b32(), v666, v665); + svfloat32_t v676 = svsub_f32_x(svptrue_b32(), v668, v667); + svfloat32_t v696 = svsub_f32_x(svptrue_b32(), v635, v642); + svfloat32_t v685 = svcmla_f32_x(pred_full, v642, v1067, v419, 90); + svfloat32_t v698 = svadd_f32_x(svptrue_b32(), v696, v696); + svfloat32_t v723 = svsub_f32_x(svptrue_b32(), v722, v696); + svfloat32_t v686 = svcmla_f32_x(pred_full, v685, v1059, v410, 90); + svfloat32_t v699 = svsub_f32_x(svptrue_b32(), v614, v698); + svfloat32_t v702 = svadd_f32_x(svptrue_b32(), v685, v685); + svfloat32_t v720 = svadd_f32_x(svptrue_b32(), v719, v698); + svfloat32_t v768 = svadd_f32_x(svptrue_b32(), v676, v723); + svfloat32_t v777 = svsub_f32_x(svptrue_b32(), v676, v723); + svfloat32_t v687 = svcmla_f32_x(pred_full, v686, v1060, v355, 90); + svfloat32_t v697 = svcmla_f32_x(pred_full, v686, v1061, v363, 90); + svfloat32_t v700 = svcmla_f32_x(pred_full, v699, v1063, v361, 90); + svfloat32_t v701 = svcmla_f32_x(pred_full, v699, v1064, v369, 90); + svfloat32_t v703 = svadd_f32_x(svptrue_b32(), v702, v702); + svfloat32_t v704 = svadd_f32_x(svptrue_b32(), v696, v702); + svfloat32_t v710 = svadd_f32_x(svptrue_b32(), v709, v702); + svfloat32_t v721 = svadd_f32_x(svptrue_b32(), v720, v702); + svint16_t v771 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v768, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t 
v780 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v777, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v706 = svadd_f32_x(svptrue_b32(), v705, v697); + svfloat32_t v708 = svadd_f32_x(svptrue_b32(), v707, v700); + svfloat32_t v712 = svsub_f32_x(svptrue_b32(), v711, v704); + svfloat32_t v714 = svadd_f32_x(svptrue_b32(), v713, v687); + svfloat32_t v717 = svsub_f32_x(svptrue_b32(), v716, v701); + svfloat32_t v750 = svadd_f32_x(svptrue_b32(), v671, v710); + svfloat32_t v759 = svsub_f32_x(svptrue_b32(), v671, v710); + svfloat32_t v858 = svadd_f32_x(svptrue_b32(), v675, v721); + svfloat32_t v867 = svsub_f32_x(svptrue_b32(), v675, v721); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1120), v1220, + svreinterpret_u64_s16(v771)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1129), v1220, + svreinterpret_u64_s16(v780)); + svfloat32_t v715 = svadd_f32_x(svptrue_b32(), v714, v696); + svfloat32_t v718 = svadd_f32_x(svptrue_b32(), v717, v703); + svfloat32_t v732 = svadd_f32_x(svptrue_b32(), v669, v706); + svfloat32_t v741 = svsub_f32_x(svptrue_b32(), v669, v706); + svint16_t v753 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v750, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v762 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v759, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v804 = svadd_f32_x(svptrue_b32(), v672, v712); + svfloat32_t v813 = svsub_f32_x(svptrue_b32(), v672, v712); + svfloat32_t v822 = svadd_f32_x(svptrue_b32(), v670, v708); + svfloat32_t v831 = svsub_f32_x(svptrue_b32(), v670, v708); + svint16_t v861 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v858, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v870 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v867, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v735 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v732, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v744 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v741, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v786 = svadd_f32_x(svptrue_b32(), v673, v715); + svfloat32_t v795 = svsub_f32_x(svptrue_b32(), v673, v715); + svint16_t v807 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v804, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v816 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v813, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v825 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v822, (float)(1ULL << 
31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v834 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v831, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v840 = svadd_f32_x(svptrue_b32(), v674, v718); + svfloat32_t v849 = svsub_f32_x(svptrue_b32(), v674, v718); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1102), v1220, + svreinterpret_u64_s16(v753)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1111), v1220, + svreinterpret_u64_s16(v762)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1210), v1220, + svreinterpret_u64_s16(v861)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1219), v1220, + svreinterpret_u64_s16(v870)); + svint16_t v789 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v786, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v798 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v795, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v843 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v840, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v852 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v849, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1084), v1220, + svreinterpret_u64_s16(v735)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1093), v1220, + svreinterpret_u64_s16(v744)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1156), v1220, + svreinterpret_u64_s16(v807)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1165), v1220, + svreinterpret_u64_s16(v816)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1174), v1220, + svreinterpret_u64_s16(v825)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1183), v1220, + svreinterpret_u64_s16(v834)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1138), v1220, + svreinterpret_u64_s16(v789)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1147), v1220, + svreinterpret_u64_s16(v798)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1192), v1220, + svreinterpret_u64_s16(v843)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1201), v1220, + svreinterpret_u64_s16(v852)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cs16_ab_t_gs18(const armral_cmplx_f32_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, + const armral_cmplx_f32_t *restrict w, + int howmany, int idist, int odist, + float dir) { + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + int32_t *v6 = (int32_t *)y; + const float32x2_t *v7 = (const float32x2_t *)w; + for (int j = 0; j < howmany; j += 1) { + float32x2_t v317 = v5[istride]; + float v695 = -5.0000000000000000e-01F; + float v706 = -1.4999999999999998e+00F; + float v709 = 8.6602540378443871e-01F; + float v710 = -8.6602540378443871e-01F; + float v717 = 7.6604444311897801e-01F; + float v721 = 9.3969262078590832e-01F; 
+ float v725 = -1.7364817766693039e-01F; + float v728 = 6.4278760968653925e-01F; + float v729 = -6.4278760968653925e-01F; + float v735 = -3.4202014332566888e-01F; + float v736 = 3.4202014332566888e-01F; + float v742 = 9.8480775301220802e-01F; + float v743 = -9.8480775301220802e-01F; + float32x2_t v745 = (float32x2_t){v4, v4}; + float32x2_t v349 = vtrn1_f32(v317, v317); + float32x2_t v350 = vtrn2_f32(v317, v317); + float32x2_t v547 = v5[0]; + float32x2_t v696 = (float32x2_t){v695, v695}; + float32x2_t v707 = (float32x2_t){v706, v706}; + float32x2_t v711 = (float32x2_t){v709, v710}; + float32x2_t v718 = (float32x2_t){v717, v717}; + float32x2_t v722 = (float32x2_t){v721, v721}; + float32x2_t v726 = (float32x2_t){v725, v725}; + float32x2_t v730 = (float32x2_t){v728, v729}; + float32x2_t v737 = (float32x2_t){v735, v736}; + float32x2_t v744 = (float32x2_t){v742, v743}; + float32x2_t v20 = v5[istride * 9]; + int64_t v37 = 16 + j * 34; + float32x2_t v51 = v5[istride * 2]; + float32x2_t v69 = v5[istride * 11]; + int64_t v86 = 2 + j * 34; + int64_t v99 = 20 + j * 34; + float32x2_t v113 = v5[istride * 4]; + float32x2_t v131 = v5[istride * 13]; + int64_t v148 = 6 + j * 34; + int64_t v161 = 24 + j * 34; + float32x2_t v175 = v5[istride * 6]; + float32x2_t v193 = v5[istride * 15]; + int64_t v210 = 10 + j * 34; + int64_t v223 = 28 + j * 34; + float32x2_t v237 = v5[istride * 8]; + float32x2_t v255 = v5[istride * 17]; + int64_t v272 = 14 + j * 34; + int64_t v285 = 32 + j * 34; + float32x2_t v299 = v5[istride * 10]; + int64_t v334 = 18 + j * 34; + float32x2_t v348 = v7[j * 34]; + int64_t v352 = j * 34 + 1; + float32x2_t v361 = v5[istride * 12]; + float32x2_t v379 = v5[istride * 3]; + int64_t v396 = 22 + j * 34; + int64_t v409 = 4 + j * 34; + float32x2_t v423 = v5[istride * 14]; + float32x2_t v441 = v5[istride * 5]; + int64_t v458 = 26 + j * 34; + int64_t v471 = 8 + j * 34; + float32x2_t v485 = v5[istride * 16]; + float32x2_t v503 = v5[istride * 7]; + int64_t v520 = 30 + j * 34; + int64_t v533 = 12 + j * 34; + float32x2_t v713 = vmul_f32(v745, v711); + float32x2_t v732 = vmul_f32(v745, v730); + float32x2_t v739 = vmul_f32(v745, v737); + float32x2_t v746 = vmul_f32(v745, v744); + float32x2_t v38 = v7[v37]; + float32x2_t v39 = vtrn1_f32(v20, v20); + float32x2_t v40 = vtrn2_f32(v20, v20); + int64_t v42 = v37 + 1; + float32x2_t v87 = v7[v86]; + float32x2_t v88 = vtrn1_f32(v51, v51); + float32x2_t v89 = vtrn2_f32(v51, v51); + int64_t v91 = v86 + 1; + float32x2_t v100 = v7[v99]; + float32x2_t v101 = vtrn1_f32(v69, v69); + float32x2_t v102 = vtrn2_f32(v69, v69); + int64_t v104 = v99 + 1; + float32x2_t v149 = v7[v148]; + float32x2_t v150 = vtrn1_f32(v113, v113); + float32x2_t v151 = vtrn2_f32(v113, v113); + int64_t v153 = v148 + 1; + float32x2_t v162 = v7[v161]; + float32x2_t v163 = vtrn1_f32(v131, v131); + float32x2_t v164 = vtrn2_f32(v131, v131); + int64_t v166 = v161 + 1; + float32x2_t v211 = v7[v210]; + float32x2_t v212 = vtrn1_f32(v175, v175); + float32x2_t v213 = vtrn2_f32(v175, v175); + int64_t v215 = v210 + 1; + float32x2_t v224 = v7[v223]; + float32x2_t v225 = vtrn1_f32(v193, v193); + float32x2_t v226 = vtrn2_f32(v193, v193); + int64_t v228 = v223 + 1; + float32x2_t v273 = v7[v272]; + float32x2_t v274 = vtrn1_f32(v237, v237); + float32x2_t v275 = vtrn2_f32(v237, v237); + int64_t v277 = v272 + 1; + float32x2_t v286 = v7[v285]; + float32x2_t v287 = vtrn1_f32(v255, v255); + float32x2_t v288 = vtrn2_f32(v255, v255); + int64_t v290 = v285 + 1; + float32x2_t v335 = v7[v334]; + float32x2_t v336 = vtrn1_f32(v299, 
v299); + float32x2_t v337 = vtrn2_f32(v299, v299); + int64_t v339 = v334 + 1; + float32x2_t v353 = v7[v352]; + float32x2_t v354 = vmul_f32(v349, v348); + float32x2_t v397 = v7[v396]; + float32x2_t v398 = vtrn1_f32(v361, v361); + float32x2_t v399 = vtrn2_f32(v361, v361); + int64_t v401 = v396 + 1; + float32x2_t v410 = v7[v409]; + float32x2_t v411 = vtrn1_f32(v379, v379); + float32x2_t v412 = vtrn2_f32(v379, v379); + int64_t v414 = v409 + 1; + float32x2_t v459 = v7[v458]; + float32x2_t v460 = vtrn1_f32(v423, v423); + float32x2_t v461 = vtrn2_f32(v423, v423); + int64_t v463 = v458 + 1; + float32x2_t v472 = v7[v471]; + float32x2_t v473 = vtrn1_f32(v441, v441); + float32x2_t v474 = vtrn2_f32(v441, v441); + int64_t v476 = v471 + 1; + float32x2_t v521 = v7[v520]; + float32x2_t v522 = vtrn1_f32(v485, v485); + float32x2_t v523 = vtrn2_f32(v485, v485); + int64_t v525 = v520 + 1; + float32x2_t v534 = v7[v533]; + float32x2_t v535 = vtrn1_f32(v503, v503); + float32x2_t v536 = vtrn2_f32(v503, v503); + int64_t v538 = v533 + 1; + float32x2_t v43 = v7[v42]; + float32x2_t v44 = vmul_f32(v39, v38); + float32x2_t v92 = v7[v91]; + float32x2_t v93 = vmul_f32(v88, v87); + float32x2_t v105 = v7[v104]; + float32x2_t v106 = vmul_f32(v101, v100); + float32x2_t v154 = v7[v153]; + float32x2_t v155 = vmul_f32(v150, v149); + float32x2_t v167 = v7[v166]; + float32x2_t v168 = vmul_f32(v163, v162); + float32x2_t v216 = v7[v215]; + float32x2_t v217 = vmul_f32(v212, v211); + float32x2_t v229 = v7[v228]; + float32x2_t v230 = vmul_f32(v225, v224); + float32x2_t v278 = v7[v277]; + float32x2_t v279 = vmul_f32(v274, v273); + float32x2_t v291 = v7[v290]; + float32x2_t v292 = vmul_f32(v287, v286); + float32x2_t v340 = v7[v339]; + float32x2_t v341 = vmul_f32(v336, v335); + float32x2_t v402 = v7[v401]; + float32x2_t v403 = vmul_f32(v398, v397); + float32x2_t v415 = v7[v414]; + float32x2_t v416 = vmul_f32(v411, v410); + float32x2_t v464 = v7[v463]; + float32x2_t v465 = vmul_f32(v460, v459); + float32x2_t v477 = v7[v476]; + float32x2_t v478 = vmul_f32(v473, v472); + float32x2_t v526 = v7[v525]; + float32x2_t v527 = vmul_f32(v522, v521); + float32x2_t v539 = v7[v538]; + float32x2_t v540 = vmul_f32(v535, v534); + float32x2_t v356 = vfma_f32(v354, v350, v353); + float32x2_t v46 = vfma_f32(v44, v40, v43); + float32x2_t v95 = vfma_f32(v93, v89, v92); + float32x2_t v108 = vfma_f32(v106, v102, v105); + float32x2_t v157 = vfma_f32(v155, v151, v154); + float32x2_t v170 = vfma_f32(v168, v164, v167); + float32x2_t v219 = vfma_f32(v217, v213, v216); + float32x2_t v232 = vfma_f32(v230, v226, v229); + float32x2_t v281 = vfma_f32(v279, v275, v278); + float32x2_t v294 = vfma_f32(v292, v288, v291); + float32x2_t v343 = vfma_f32(v341, v337, v340); + float32x2_t v405 = vfma_f32(v403, v399, v402); + float32x2_t v418 = vfma_f32(v416, v412, v415); + float32x2_t v467 = vfma_f32(v465, v461, v464); + float32x2_t v480 = vfma_f32(v478, v474, v477); + float32x2_t v529 = vfma_f32(v527, v523, v526); + float32x2_t v542 = vfma_f32(v540, v536, v539); + float32x2_t v548 = vadd_f32(v547, v46); + float32x2_t v549 = vsub_f32(v547, v46); + float32x2_t v550 = vadd_f32(v95, v108); + float32x2_t v551 = vsub_f32(v95, v108); + float32x2_t v552 = vadd_f32(v157, v170); + float32x2_t v553 = vsub_f32(v157, v170); + float32x2_t v554 = vadd_f32(v219, v232); + float32x2_t v555 = vsub_f32(v219, v232); + float32x2_t v556 = vadd_f32(v281, v294); + float32x2_t v557 = vsub_f32(v281, v294); + float32x2_t v558 = vadd_f32(v343, v356); + float32x2_t v559 = vsub_f32(v343, v356); + float32x2_t 
v560 = vadd_f32(v405, v418); + float32x2_t v561 = vsub_f32(v405, v418); + float32x2_t v562 = vadd_f32(v467, v480); + float32x2_t v563 = vsub_f32(v467, v480); + float32x2_t v564 = vadd_f32(v529, v542); + float32x2_t v565 = vsub_f32(v529, v542); + float32x2_t v566 = vadd_f32(v550, v564); + float32x2_t v567 = vsub_f32(v550, v564); + float32x2_t v568 = vadd_f32(v562, v552); + float32x2_t v569 = vsub_f32(v562, v552); + float32x2_t v570 = vadd_f32(v554, v560); + float32x2_t v571 = vsub_f32(v554, v560); + float32x2_t v572 = vadd_f32(v556, v558); + float32x2_t v573 = vsub_f32(v556, v558); + float32x2_t v670 = vadd_f32(v551, v565); + float32x2_t v671 = vsub_f32(v551, v565); + float32x2_t v672 = vadd_f32(v563, v553); + float32x2_t v673 = vsub_f32(v563, v553); + float32x2_t v674 = vadd_f32(v555, v561); + float32x2_t v675 = vsub_f32(v555, v561); + float32x2_t v676 = vadd_f32(v557, v559); + float32x2_t v677 = vsub_f32(v557, v559); + float32x2_t v574 = vadd_f32(v566, v568); + float32x2_t v578 = vadd_f32(v567, v569); + float32x2_t v580 = vsub_f32(v566, v568); + float32x2_t v581 = vsub_f32(v568, v572); + float32x2_t v582 = vsub_f32(v572, v566); + float32x2_t v583 = vsub_f32(v567, v569); + float32x2_t v584 = vsub_f32(v569, v573); + float32x2_t v585 = vsub_f32(v573, v567); + float32x2_t v604 = vmul_f32(v570, v707); + float32x2_t v610 = vrev64_f32(v571); + float32x2_t v678 = vadd_f32(v670, v672); + float32x2_t v682 = vadd_f32(v671, v673); + float32x2_t v684 = vsub_f32(v670, v672); + float32x2_t v685 = vsub_f32(v672, v676); + float32x2_t v686 = vsub_f32(v676, v670); + float32x2_t v687 = vsub_f32(v671, v673); + float32x2_t v688 = vsub_f32(v673, v677); + float32x2_t v689 = vsub_f32(v677, v671); + float32x2_t v708 = vmul_f32(v674, v707); + float32x2_t v714 = vrev64_f32(v675); + float32x2_t v575 = vadd_f32(v574, v572); + float32x2_t v579 = vadd_f32(v578, v573); + float32x2_t v611 = vmul_f32(v610, v713); + float32x2_t v615 = vmul_f32(v580, v718); + float32x2_t v619 = vmul_f32(v581, v722); + float32x2_t v623 = vmul_f32(v582, v726); + float32x2_t v629 = vrev64_f32(v583); + float32x2_t v636 = vrev64_f32(v584); + float32x2_t v643 = vrev64_f32(v585); + float32x2_t v679 = vadd_f32(v678, v676); + float32x2_t v683 = vadd_f32(v682, v677); + float32x2_t v715 = vmul_f32(v714, v713); + float32x2_t v719 = vmul_f32(v684, v718); + float32x2_t v723 = vmul_f32(v685, v722); + float32x2_t v727 = vmul_f32(v686, v726); + float32x2_t v733 = vrev64_f32(v687); + float32x2_t v740 = vrev64_f32(v688); + float32x2_t v747 = vrev64_f32(v689); + float32x2_t v576 = vadd_f32(v575, v570); + float32x2_t v593 = vmul_f32(v575, v696); + float32x2_t v599 = vrev64_f32(v579); + float32x2_t v630 = vmul_f32(v629, v732); + float32x2_t v637 = vmul_f32(v636, v739); + float32x2_t v644 = vmul_f32(v643, v746); + float32x2_t v680 = vadd_f32(v679, v674); + float32x2_t v697 = vmul_f32(v679, v696); + float32x2_t v703 = vrev64_f32(v683); + float32x2_t v734 = vmul_f32(v733, v732); + float32x2_t v741 = vmul_f32(v740, v739); + float32x2_t v748 = vmul_f32(v747, v746); + float32x2_t v577 = vadd_f32(v576, v548); + float32x2_t v600 = vmul_f32(v599, v713); + float32x2_t v645 = vadd_f32(v593, v593); + float32x2_t v658 = vadd_f32(v611, v630); + float32x2_t v660 = vsub_f32(v611, v637); + float32x2_t v662 = vsub_f32(v611, v630); + float32x2_t v681 = vadd_f32(v680, v549); + float32x2_t v704 = vmul_f32(v703, v713); + float32x2_t v749 = vadd_f32(v697, v697); + float32x2_t v762 = vadd_f32(v715, v734); + float32x2_t v764 = vsub_f32(v715, v741); + float32x2_t v766 = vsub_f32(v715, 
v734); + float32x2_t v646 = vadd_f32(v645, v593); + float32x2_t v650 = vadd_f32(v577, v604); + float32x2_t v659 = vadd_f32(v658, v637); + float32x2_t v661 = vadd_f32(v660, v644); + float32x2_t v663 = vsub_f32(v662, v644); + float32x2_t v750 = vadd_f32(v749, v697); + float32x2_t v754 = vadd_f32(v681, v708); + float32x2_t v763 = vadd_f32(v762, v741); + float32x2_t v765 = vadd_f32(v764, v748); + float32x2_t v767 = vsub_f32(v766, v748); + int16x4_t v776 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v577, 15), (int32x2_t){0, 0})); + int16x4_t v782 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v681, 15), (int32x2_t){0, 0})); + float32x2_t v647 = vadd_f32(v577, v646); + float32x2_t v651 = vadd_f32(v650, v645); + float32x2_t v751 = vadd_f32(v681, v750); + float32x2_t v755 = vadd_f32(v754, v749); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v776), 0); + v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v782), 0); + float32x2_t v648 = vadd_f32(v647, v600); + float32x2_t v649 = vsub_f32(v647, v600); + float32x2_t v652 = vadd_f32(v651, v615); + float32x2_t v654 = vsub_f32(v651, v619); + float32x2_t v656 = vsub_f32(v651, v615); + float32x2_t v752 = vadd_f32(v751, v704); + float32x2_t v753 = vsub_f32(v751, v704); + float32x2_t v756 = vadd_f32(v755, v719); + float32x2_t v758 = vsub_f32(v755, v723); + float32x2_t v760 = vsub_f32(v755, v719); + float32x2_t v653 = vadd_f32(v652, v619); + float32x2_t v655 = vadd_f32(v654, v623); + float32x2_t v657 = vsub_f32(v656, v623); + float32x2_t v757 = vadd_f32(v756, v723); + float32x2_t v759 = vadd_f32(v758, v727); + float32x2_t v761 = vsub_f32(v760, v727); + int16x4_t v812 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v649, 15), (int32x2_t){0, 0})); + int16x4_t v818 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v753, 15), (int32x2_t){0, 0})); + int16x4_t v848 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v648, 15), (int32x2_t){0, 0})); + int16x4_t v854 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v752, 15), (int32x2_t){0, 0})); + float32x2_t v664 = vadd_f32(v653, v659); + float32x2_t v665 = vsub_f32(v653, v659); + float32x2_t v666 = vadd_f32(v655, v661); + float32x2_t v667 = vsub_f32(v655, v661); + float32x2_t v668 = vadd_f32(v657, v663); + float32x2_t v669 = vsub_f32(v657, v663); + float32x2_t v768 = vadd_f32(v757, v763); + float32x2_t v769 = vsub_f32(v757, v763); + float32x2_t v770 = vadd_f32(v759, v765); + float32x2_t v771 = vsub_f32(v759, v765); + float32x2_t v772 = vadd_f32(v761, v767); + float32x2_t v773 = vsub_f32(v761, v767); + v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v812), 0); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v818), 0); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v848), 0); + v6[ostride * 15] = vget_lane_s32(vreinterpret_s32_s16(v854), 0); + int16x4_t v788 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v665, 15), (int32x2_t){0, 0})); + int16x4_t v794 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v769, 15), (int32x2_t){0, 0})); + int16x4_t v800 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v666, 15), (int32x2_t){0, 0})); + int16x4_t v806 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v770, 15), (int32x2_t){0, 0})); + int16x4_t v824 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v669, 15), (int32x2_t){0, 0})); + int16x4_t v830 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v773, 15), (int32x2_t){0, 0})); + int16x4_t v836 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v668, 15), (int32x2_t){0, 0})); + int16x4_t v842 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v772, 15), (int32x2_t){0, 0})); + int16x4_t v860 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v667, 
15), (int32x2_t){0, 0})); + int16x4_t v866 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v771, 15), (int32x2_t){0, 0})); + int16x4_t v872 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v664, 15), (int32x2_t){0, 0})); + int16x4_t v878 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v768, 15), (int32x2_t){0, 0})); + v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v788), 0); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v794), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v800), 0); + v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v806), 0); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v824), 0); + v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v830), 0); + v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v836), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v842), 0); + v6[ostride * 16] = vget_lane_s32(vreinterpret_s32_s16(v860), 0); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v866), 0); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v872), 0); + v6[ostride * 17] = vget_lane_s32(vreinterpret_s32_s16(v878), 0); + v5 += 1 * idist; + v6 += 1 * odist; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cs16_ab_t_gs18(const armral_cmplx_f32_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, + const armral_cmplx_f32_t *restrict w, + int howmany, int idist, int odist, + float dir) { + int64_t v0 = istride; + int64_t v1 = idist; + int64_t v2 = ostride; + int64_t v3 = odist; + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + int32_t *v6 = (int32_t *)y; + const float32x2_t *v7 = (const float32x2_t *)w; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * v1; + int64_t v12 = v10 * v3; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + int64_t v13 = j; + float v536 = -5.0000000000000000e-01F; + float v548 = -1.4999999999999998e+00F; + float v553 = -8.6602540378443871e-01F; + float v560 = 7.6604444311897801e-01F; + float v565 = 9.3969262078590832e-01F; + float v570 = -1.7364817766693039e-01F; + float v575 = -6.4278760968653925e-01F; + float v582 = 3.4202014332566888e-01F; + float v589 = -9.8480775301220802e-01F; + const float32x2_t *v860 = &v5[v0]; + int32_t *v985 = &v6[v2]; + int64_t v19 = v0 * 9; + int64_t v34 = v10 * 8; + int64_t v40 = v0 * 2; + int64_t v54 = v0 * 11; + int64_t v76 = v10 * 10; + int64_t v82 = v0 * 4; + int64_t v96 = v0 * 13; + int64_t v111 = v10 * 3; + int64_t v118 = v10 * 12; + int64_t v124 = v0 * 6; + int64_t v138 = v0 * 15; + int64_t v153 = v10 * 5; + int64_t v160 = v10 * 14; + int64_t v166 = v0 * 8; + int64_t v180 = v0 * 17; + int64_t v195 = v10 * 7; + int64_t v202 = v10 * 16; + int64_t v208 = v0 * 10; + int64_t v237 = v10 * 9; + int64_t v250 = v0 * 12; + int64_t v264 = v0 * 3; + int64_t v279 = v10 * 11; + int64_t v286 = v10 * 2; + int64_t v292 = v0 * 14; + int64_t v306 = v0 * 5; + int64_t v321 = v10 * 13; + int64_t v328 = v10 * 4; + int64_t v334 = v0 * 16; + int64_t v348 = v0 * 7; + int64_t v363 = v10 * 15; + int64_t v370 = v10 * 6; + int64_t v371 = v13 * 17; + float v556 = v4 * v553; + float v578 = v4 * v575; + float v585 = v4 * v582; + float v592 = v4 * v589; + int64_t v629 = v2 * 9; + int64_t v637 = v2 * 10; + int64_t v653 = v2 * 2; + int64_t v661 = v2 * 11; + int64_t v669 = v2 * 12; + int64_t v677 = v2 * 3; + int64_t v685 = v2 * 4; + int64_t v693 = v2 * 13; + int64_t v701 = v2 * 14; + int64_t v709 = v2 * 5; + int64_t v717 = v2 * 6; + int64_t v725 = v2 * 15; + int64_t v733 = v2 * 
16; + int64_t v741 = v2 * 7; + int64_t v749 = v2 * 8; + int64_t v757 = v2 * 17; + const float32x2_t *v926 = &v5[0]; + svint64_t v927 = svindex_s64(0, v1); + svfloat32_t v941 = svdup_n_f32(v536); + svfloat32_t v943 = svdup_n_f32(v548); + svfloat32_t v945 = svdup_n_f32(v560); + svfloat32_t v946 = svdup_n_f32(v565); + svfloat32_t v947 = svdup_n_f32(v570); + int32_t *v958 = &v6[0]; + svint64_t v1112 = svindex_s64(0, v3); + int64_t v36 = v34 + v371; + int64_t v71 = v10 + v371; + int64_t v78 = v76 + v371; + int64_t v113 = v111 + v371; + int64_t v120 = v118 + v371; + int64_t v155 = v153 + v371; + int64_t v162 = v160 + v371; + int64_t v197 = v195 + v371; + int64_t v204 = v202 + v371; + int64_t v239 = v237 + v371; + svfloat32_t v247 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v371])); + int64_t v281 = v279 + v371; + int64_t v288 = v286 + v371; + int64_t v323 = v321 + v371; + int64_t v330 = v328 + v371; + int64_t v365 = v363 + v371; + int64_t v372 = v370 + v371; + const float32x2_t *v770 = &v5[v19]; + const float32x2_t *v779 = &v5[v40]; + const float32x2_t *v788 = &v5[v54]; + const float32x2_t *v797 = &v5[v82]; + const float32x2_t *v806 = &v5[v96]; + const float32x2_t *v815 = &v5[v124]; + const float32x2_t *v824 = &v5[v138]; + const float32x2_t *v833 = &v5[v166]; + const float32x2_t *v842 = &v5[v180]; + const float32x2_t *v851 = &v5[v208]; + svfloat32_t v862 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v860), v927)); + const float32x2_t *v871 = &v5[v250]; + const float32x2_t *v880 = &v5[v264]; + const float32x2_t *v889 = &v5[v292]; + const float32x2_t *v898 = &v5[v306]; + const float32x2_t *v907 = &v5[v334]; + const float32x2_t *v916 = &v5[v348]; + svfloat32_t v928 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v926), v927)); + svfloat32_t v944 = svdup_n_f32(v556); + svfloat32_t v948 = svdup_n_f32(v578); + svfloat32_t v949 = svdup_n_f32(v585); + svfloat32_t v950 = svdup_n_f32(v592); + int32_t *v967 = &v6[v629]; + int32_t *v976 = &v6[v637]; + int32_t *v994 = &v6[v653]; + int32_t *v1003 = &v6[v661]; + int32_t *v1012 = &v6[v669]; + int32_t *v1021 = &v6[v677]; + int32_t *v1030 = &v6[v685]; + int32_t *v1039 = &v6[v693]; + int32_t *v1048 = &v6[v701]; + int32_t *v1057 = &v6[v709]; + int32_t *v1066 = &v6[v717]; + int32_t *v1075 = &v6[v725]; + int32_t *v1084 = &v6[v733]; + int32_t *v1093 = &v6[v741]; + int32_t *v1102 = &v6[v749]; + int32_t *v1111 = &v6[v757]; + svfloat32_t v37 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v36])); + svfloat32_t v72 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v71])); + svfloat32_t v79 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v78])); + svfloat32_t v114 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v113])); + svfloat32_t v121 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v120])); + svfloat32_t v156 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v155])); + svfloat32_t v163 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v162])); + svfloat32_t v198 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v197])); + svfloat32_t v205 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v204])); + svfloat32_t v240 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v239])); + svfloat32_t zero248 = svdup_n_f32(0); + svfloat32_t v248 = + svcmla_f32_x(pred_full, 
svcmla_f32_x(pred_full, zero248, v862, v247, 0), + v862, v247, 90); + svfloat32_t v282 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v281])); + svfloat32_t v289 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v288])); + svfloat32_t v324 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v323])); + svfloat32_t v331 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v330])); + svfloat32_t v366 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v365])); + svfloat32_t v373 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v372])); + svfloat32_t v772 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v770), v927)); + svfloat32_t v781 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v779), v927)); + svfloat32_t v790 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v788), v927)); + svfloat32_t v799 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v797), v927)); + svfloat32_t v808 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v806), v927)); + svfloat32_t v817 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v815), v927)); + svfloat32_t v826 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v824), v927)); + svfloat32_t v835 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v833), v927)); + svfloat32_t v844 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v842), v927)); + svfloat32_t v853 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v851), v927)); + svfloat32_t v873 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v871), v927)); + svfloat32_t v882 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v880), v927)); + svfloat32_t v891 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v889), v927)); + svfloat32_t v900 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v898), v927)); + svfloat32_t v909 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v907), v927)); + svfloat32_t v918 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v916), v927)); + svfloat32_t zero38 = svdup_n_f32(0); + svfloat32_t v38 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero38, v772, v37, 0), + v772, v37, 90); + svfloat32_t zero73 = svdup_n_f32(0); + svfloat32_t v73 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero73, v781, v72, 0), + v781, v72, 90); + svfloat32_t zero80 = svdup_n_f32(0); + svfloat32_t v80 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero80, v790, v79, 0), + v790, v79, 90); + svfloat32_t zero115 = svdup_n_f32(0); + svfloat32_t v115 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero115, v799, v114, 0), + v799, v114, 90); + svfloat32_t zero122 = svdup_n_f32(0); + svfloat32_t v122 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero122, v808, v121, 0), + v808, v121, 90); + svfloat32_t zero157 = svdup_n_f32(0); + svfloat32_t v157 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero157, v817, v156, 0), + v817, v156, 90); + svfloat32_t zero164 = svdup_n_f32(0); + svfloat32_t v164 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero164, v826, v163, 0), + v826, 
v163, 90); + svfloat32_t zero199 = svdup_n_f32(0); + svfloat32_t v199 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero199, v835, v198, 0), + v835, v198, 90); + svfloat32_t zero206 = svdup_n_f32(0); + svfloat32_t v206 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero206, v844, v205, 0), + v844, v205, 90); + svfloat32_t zero241 = svdup_n_f32(0); + svfloat32_t v241 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero241, v853, v240, 0), + v853, v240, 90); + svfloat32_t zero283 = svdup_n_f32(0); + svfloat32_t v283 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero283, v873, v282, 0), + v873, v282, 90); + svfloat32_t zero290 = svdup_n_f32(0); + svfloat32_t v290 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero290, v882, v289, 0), + v882, v289, 90); + svfloat32_t zero325 = svdup_n_f32(0); + svfloat32_t v325 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero325, v891, v324, 0), + v891, v324, 90); + svfloat32_t zero332 = svdup_n_f32(0); + svfloat32_t v332 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero332, v900, v331, 0), + v900, v331, 90); + svfloat32_t zero367 = svdup_n_f32(0); + svfloat32_t v367 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero367, v909, v366, 0), + v909, v366, 90); + svfloat32_t zero374 = svdup_n_f32(0); + svfloat32_t v374 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero374, v918, v373, 0), + v918, v373, 90); + svfloat32_t v382 = svadd_f32_x(svptrue_b32(), v928, v38); + svfloat32_t v383 = svsub_f32_x(svptrue_b32(), v928, v38); + svfloat32_t v384 = svadd_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v385 = svsub_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v386 = svadd_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v387 = svsub_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v388 = svadd_f32_x(svptrue_b32(), v157, v164); + svfloat32_t v389 = svsub_f32_x(svptrue_b32(), v157, v164); + svfloat32_t v390 = svadd_f32_x(svptrue_b32(), v199, v206); + svfloat32_t v391 = svsub_f32_x(svptrue_b32(), v199, v206); + svfloat32_t v392 = svadd_f32_x(svptrue_b32(), v241, v248); + svfloat32_t v393 = svsub_f32_x(svptrue_b32(), v241, v248); + svfloat32_t v394 = svadd_f32_x(svptrue_b32(), v283, v290); + svfloat32_t v395 = svsub_f32_x(svptrue_b32(), v283, v290); + svfloat32_t v396 = svadd_f32_x(svptrue_b32(), v325, v332); + svfloat32_t v397 = svsub_f32_x(svptrue_b32(), v325, v332); + svfloat32_t v398 = svadd_f32_x(svptrue_b32(), v367, v374); + svfloat32_t v399 = svsub_f32_x(svptrue_b32(), v367, v374); + svfloat32_t v400 = svadd_f32_x(svptrue_b32(), v384, v398); + svfloat32_t v401 = svsub_f32_x(svptrue_b32(), v384, v398); + svfloat32_t v402 = svadd_f32_x(svptrue_b32(), v396, v386); + svfloat32_t v403 = svsub_f32_x(svptrue_b32(), v396, v386); + svfloat32_t v404 = svadd_f32_x(svptrue_b32(), v388, v394); + svfloat32_t v405 = svsub_f32_x(svptrue_b32(), v388, v394); + svfloat32_t v406 = svadd_f32_x(svptrue_b32(), v390, v392); + svfloat32_t v407 = svsub_f32_x(svptrue_b32(), v390, v392); + svfloat32_t v510 = svadd_f32_x(svptrue_b32(), v385, v399); + svfloat32_t v511 = svsub_f32_x(svptrue_b32(), v385, v399); + svfloat32_t v512 = svadd_f32_x(svptrue_b32(), v397, v387); + svfloat32_t v513 = svsub_f32_x(svptrue_b32(), v397, v387); + svfloat32_t v514 = svadd_f32_x(svptrue_b32(), v389, v395); + svfloat32_t v515 = svsub_f32_x(svptrue_b32(), v389, v395); + svfloat32_t v516 = svadd_f32_x(svptrue_b32(), v391, v393); + svfloat32_t v517 = svsub_f32_x(svptrue_b32(), v391, v393); + svfloat32_t v408 = svadd_f32_x(svptrue_b32(), v400, v402); + svfloat32_t v412 
= svadd_f32_x(svptrue_b32(), v401, v403); + svfloat32_t v414 = svsub_f32_x(svptrue_b32(), v400, v402); + svfloat32_t v415 = svsub_f32_x(svptrue_b32(), v402, v406); + svfloat32_t v416 = svsub_f32_x(svptrue_b32(), v406, v400); + svfloat32_t v417 = svsub_f32_x(svptrue_b32(), v401, v403); + svfloat32_t v418 = svsub_f32_x(svptrue_b32(), v403, v407); + svfloat32_t v419 = svsub_f32_x(svptrue_b32(), v407, v401); + svfloat32_t zero448 = svdup_n_f32(0); + svfloat32_t v448 = svcmla_f32_x(pred_full, zero448, v944, v405, 90); + svfloat32_t v518 = svadd_f32_x(svptrue_b32(), v510, v512); + svfloat32_t v522 = svadd_f32_x(svptrue_b32(), v511, v513); + svfloat32_t v524 = svsub_f32_x(svptrue_b32(), v510, v512); + svfloat32_t v525 = svsub_f32_x(svptrue_b32(), v512, v516); + svfloat32_t v526 = svsub_f32_x(svptrue_b32(), v516, v510); + svfloat32_t v527 = svsub_f32_x(svptrue_b32(), v511, v513); + svfloat32_t v528 = svsub_f32_x(svptrue_b32(), v513, v517); + svfloat32_t v529 = svsub_f32_x(svptrue_b32(), v517, v511); + svfloat32_t zero558 = svdup_n_f32(0); + svfloat32_t v558 = svcmla_f32_x(pred_full, zero558, v944, v515, 90); + svfloat32_t v409 = svadd_f32_x(svptrue_b32(), v408, v406); + svfloat32_t v413 = svadd_f32_x(svptrue_b32(), v412, v407); + svfloat32_t zero470 = svdup_n_f32(0); + svfloat32_t v470 = svcmla_f32_x(pred_full, zero470, v948, v417, 90); + svfloat32_t zero477 = svdup_n_f32(0); + svfloat32_t v477 = svcmla_f32_x(pred_full, zero477, v949, v418, 90); + svfloat32_t zero484 = svdup_n_f32(0); + svfloat32_t v484 = svcmla_f32_x(pred_full, zero484, v950, v419, 90); + svfloat32_t v519 = svadd_f32_x(svptrue_b32(), v518, v516); + svfloat32_t v523 = svadd_f32_x(svptrue_b32(), v522, v517); + svfloat32_t zero580 = svdup_n_f32(0); + svfloat32_t v580 = svcmla_f32_x(pred_full, zero580, v948, v527, 90); + svfloat32_t zero587 = svdup_n_f32(0); + svfloat32_t v587 = svcmla_f32_x(pred_full, zero587, v949, v528, 90); + svfloat32_t zero594 = svdup_n_f32(0); + svfloat32_t v594 = svcmla_f32_x(pred_full, zero594, v950, v529, 90); + svfloat32_t v410 = svadd_f32_x(svptrue_b32(), v409, v404); + svfloat32_t v429 = svmul_f32_x(svptrue_b32(), v409, v941); + svfloat32_t zero436 = svdup_n_f32(0); + svfloat32_t v436 = svcmla_f32_x(pred_full, zero436, v944, v413, 90); + svfloat32_t v498 = svadd_f32_x(svptrue_b32(), v448, v470); + svfloat32_t v500 = svsub_f32_x(svptrue_b32(), v448, v477); + svfloat32_t v502 = svsub_f32_x(svptrue_b32(), v448, v470); + svfloat32_t v520 = svadd_f32_x(svptrue_b32(), v519, v514); + svfloat32_t v539 = svmul_f32_x(svptrue_b32(), v519, v941); + svfloat32_t zero546 = svdup_n_f32(0); + svfloat32_t v546 = svcmla_f32_x(pred_full, zero546, v944, v523, 90); + svfloat32_t v608 = svadd_f32_x(svptrue_b32(), v558, v580); + svfloat32_t v610 = svsub_f32_x(svptrue_b32(), v558, v587); + svfloat32_t v612 = svsub_f32_x(svptrue_b32(), v558, v580); + svfloat32_t v411 = svadd_f32_x(svptrue_b32(), v410, v382); + svfloat32_t v485 = svadd_f32_x(svptrue_b32(), v429, v429); + svfloat32_t v499 = svadd_f32_x(svptrue_b32(), v498, v477); + svfloat32_t v501 = svadd_f32_x(svptrue_b32(), v500, v484); + svfloat32_t v503 = svsub_f32_x(svptrue_b32(), v502, v484); + svfloat32_t v521 = svadd_f32_x(svptrue_b32(), v520, v383); + svfloat32_t v595 = svadd_f32_x(svptrue_b32(), v539, v539); + svfloat32_t v609 = svadd_f32_x(svptrue_b32(), v608, v587); + svfloat32_t v611 = svadd_f32_x(svptrue_b32(), v610, v594); + svfloat32_t v613 = svsub_f32_x(svptrue_b32(), v612, v594); + svfloat32_t v486 = svmla_f32_x(pred_full, v485, v409, v941); + svfloat32_t v490 = 
svmla_f32_x(pred_full, v411, v404, v943); + svfloat32_t v596 = svmla_f32_x(pred_full, v595, v519, v941); + svfloat32_t v600 = svmla_f32_x(pred_full, v521, v514, v943); + svint16_t v622 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v411, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v630 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v521, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v487 = svadd_f32_x(svptrue_b32(), v411, v486); + svfloat32_t v491 = svadd_f32_x(svptrue_b32(), v490, v485); + svfloat32_t v597 = svadd_f32_x(svptrue_b32(), v521, v596); + svfloat32_t v601 = svadd_f32_x(svptrue_b32(), v600, v595); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v958), v1112, + svreinterpret_u64_s16(v622)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v967), v1112, + svreinterpret_u64_s16(v630)); + svfloat32_t v488 = svadd_f32_x(svptrue_b32(), v487, v436); + svfloat32_t v489 = svsub_f32_x(svptrue_b32(), v487, v436); + svfloat32_t v492 = svmla_f32_x(pred_full, v491, v414, v945); + svfloat32_t v494 = svmls_f32_x(pred_full, v491, v415, v946); + svfloat32_t v496 = svmls_f32_x(pred_full, v491, v414, v945); + svfloat32_t v598 = svadd_f32_x(svptrue_b32(), v597, v546); + svfloat32_t v599 = svsub_f32_x(svptrue_b32(), v597, v546); + svfloat32_t v602 = svmla_f32_x(pred_full, v601, v524, v945); + svfloat32_t v604 = svmls_f32_x(pred_full, v601, v525, v946); + svfloat32_t v606 = svmls_f32_x(pred_full, v601, v524, v945); + svfloat32_t v493 = svmla_f32_x(pred_full, v492, v415, v946); + svfloat32_t v495 = svmla_f32_x(pred_full, v494, v416, v947); + svfloat32_t v497 = svmls_f32_x(pred_full, v496, v416, v947); + svfloat32_t v603 = svmla_f32_x(pred_full, v602, v525, v946); + svfloat32_t v605 = svmla_f32_x(pred_full, v604, v526, v947); + svfloat32_t v607 = svmls_f32_x(pred_full, v606, v526, v947); + svint16_t v670 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v489, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v678 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v599, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v718 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v488, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v726 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v598, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v504 = svadd_f32_x(svptrue_b32(), v493, v499); + svfloat32_t v505 = svsub_f32_x(svptrue_b32(), v493, v499); + svfloat32_t v506 = svadd_f32_x(svptrue_b32(), v495, v501); + svfloat32_t v507 = svsub_f32_x(svptrue_b32(), v495, v501); + svfloat32_t v508 = svadd_f32_x(svptrue_b32(), v497, v503); + svfloat32_t v509 = svsub_f32_x(svptrue_b32(), v497, v503); + svfloat32_t v614 = svadd_f32_x(svptrue_b32(), v603, v609); + svfloat32_t v615 = svsub_f32_x(svptrue_b32(), v603, v609); + svfloat32_t v616 = svadd_f32_x(svptrue_b32(), v605, v611); + svfloat32_t 
v617 = svsub_f32_x(svptrue_b32(), v605, v611); + svfloat32_t v618 = svadd_f32_x(svptrue_b32(), v607, v613); + svfloat32_t v619 = svsub_f32_x(svptrue_b32(), v607, v613); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1012), v1112, + svreinterpret_u64_s16(v670)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1021), v1112, + svreinterpret_u64_s16(v678)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1066), v1112, + svreinterpret_u64_s16(v718)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1075), v1112, + svreinterpret_u64_s16(v726)); + svint16_t v638 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v505, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v646 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v615, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v654 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v506, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v662 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v616, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v686 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v509, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v694 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v619, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v702 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v508, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v710 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v618, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v734 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v507, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v742 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v617, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v750 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v504, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v758 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v614, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v976), v1112, + svreinterpret_u64_s16(v638)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v985), v1112, + svreinterpret_u64_s16(v646)); + 
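+  // Each result vector above has been narrowed to int16 (scale by 2^31, convert to int32, keep the high 16-bit halves via TBL, i.e. a Q15 narrowing); the scatter stores below write them out through the strided cs16 output pointers.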
svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v994), v1112, + svreinterpret_u64_s16(v654)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1003), v1112, + svreinterpret_u64_s16(v662)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1030), v1112, + svreinterpret_u64_s16(v686)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1039), v1112, + svreinterpret_u64_s16(v694)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1048), v1112, + svreinterpret_u64_s16(v702)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1057), v1112, + svreinterpret_u64_s16(v710)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1084), v1112, + svreinterpret_u64_s16(v734)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1093), v1112, + svreinterpret_u64_s16(v742)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1102), v1112, + svreinterpret_u64_s16(v750)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1111), v1112, + svreinterpret_u64_s16(v758)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cs16_ab_t_gs19(const armral_cmplx_f32_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, + const armral_cmplx_f32_t *restrict w, + int howmany, int idist, int odist, + float dir) { + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + int32_t *v6 = (int32_t *)y; + const float32x2_t *v7 = (const float32x2_t *)w; + for (int j = 0; j < howmany; j += 1) { + float32x2_t v20 = v5[istride]; + float v667 = -1.0555555555555556e+00F; + float v671 = 1.7752228513927079e-01F; + float v675 = -1.2820077502191529e-01F; + float v679 = 4.9321510117355499e-02F; + float v683 = 5.7611011491005903e-01F; + float v687 = -7.4996449655536279e-01F; + float v691 = -1.7385438164530381e-01F; + float v695 = -2.1729997561977314e+00F; + float v699 = -1.7021211726914738e+00F; + float v703 = 4.7087858350625778e-01F; + float v707 = -2.0239400846888440e+00F; + float v711 = 1.0551641201664090e-01F; + float v715 = 2.1294564967054850e+00F; + float v719 = -7.5087543897371167e-01F; + float v723 = 1.4812817695157160e-01F; + float v727 = 8.9900361592528333e-01F; + float v731 = -6.2148246772602778e-01F; + float v735 = -7.9869352098712687e-01F; + float v739 = -4.7339199623771833e-01F; + float v742 = -2.4216105241892630e-01F; + float v743 = 2.4216105241892630e-01F; + float v749 = -5.9368607967505101e-02F; + float v750 = 5.9368607967505101e-02F; + float v756 = 1.2578688255176201e-02F; + float v757 = -1.2578688255176201e-02F; + float v763 = -4.6789919712328903e-02F; + float v764 = 4.6789919712328903e-02F; + float v770 = -9.3750121913782358e-01F; + float v771 = 9.3750121913782358e-01F; + float v777 = -5.0111537043352902e-02F; + float v778 = 5.0111537043352902e-02F; + float v784 = -9.8761275618117661e-01F; + float v785 = 9.8761275618117661e-01F; + float v791 = -1.1745786501205959e+00F; + float v792 = 1.1745786501205959e+00F; + float v798 = 1.1114482296234993e+00F; + float v799 = -1.1114482296234993e+00F; + float v805 = 2.2860268797440955e+00F; + float v806 = -2.2860268797440955e+00F; + float v812 = 2.6420523257930939e-01F; + float v813 = -2.6420523257930939e-01F; + float v819 = 2.1981792779352136e+00F; + float v820 = -2.1981792779352136e+00F; + float v826 = 1.9339740453559042e+00F; + float v827 = -1.9339740453559042e+00F; + float v833 = -7.4825847091254893e-01F; + float v834 = 7.4825847091254893e-01F; + float v840 = -4.7820835642768872e-01F; + float v841 = 4.7820835642768872e-01F; + float v847 = 
2.7005011448486022e-01F; + float v848 = -2.7005011448486022e-01F; + float v854 = -3.4642356159542270e-01F; + float v855 = 3.4642356159542270e-01F; + float v861 = -8.3485429360688279e-01F; + float v862 = 8.3485429360688279e-01F; + float v868 = -3.9375928506743518e-01F; + float v869 = 3.9375928506743518e-01F; + float32x2_t v871 = (float32x2_t){v4, v4}; + float32x2_t v57 = vtrn1_f32(v20, v20); + float32x2_t v58 = vtrn2_f32(v20, v20); + float32x2_t v612 = v5[0]; + float32x2_t v668 = (float32x2_t){v667, v667}; + float32x2_t v672 = (float32x2_t){v671, v671}; + float32x2_t v676 = (float32x2_t){v675, v675}; + float32x2_t v680 = (float32x2_t){v679, v679}; + float32x2_t v684 = (float32x2_t){v683, v683}; + float32x2_t v688 = (float32x2_t){v687, v687}; + float32x2_t v692 = (float32x2_t){v691, v691}; + float32x2_t v696 = (float32x2_t){v695, v695}; + float32x2_t v700 = (float32x2_t){v699, v699}; + float32x2_t v704 = (float32x2_t){v703, v703}; + float32x2_t v708 = (float32x2_t){v707, v707}; + float32x2_t v712 = (float32x2_t){v711, v711}; + float32x2_t v716 = (float32x2_t){v715, v715}; + float32x2_t v720 = (float32x2_t){v719, v719}; + float32x2_t v724 = (float32x2_t){v723, v723}; + float32x2_t v728 = (float32x2_t){v727, v727}; + float32x2_t v732 = (float32x2_t){v731, v731}; + float32x2_t v736 = (float32x2_t){v735, v735}; + float32x2_t v740 = (float32x2_t){v739, v739}; + float32x2_t v744 = (float32x2_t){v742, v743}; + float32x2_t v751 = (float32x2_t){v749, v750}; + float32x2_t v758 = (float32x2_t){v756, v757}; + float32x2_t v765 = (float32x2_t){v763, v764}; + float32x2_t v772 = (float32x2_t){v770, v771}; + float32x2_t v779 = (float32x2_t){v777, v778}; + float32x2_t v786 = (float32x2_t){v784, v785}; + float32x2_t v793 = (float32x2_t){v791, v792}; + float32x2_t v800 = (float32x2_t){v798, v799}; + float32x2_t v807 = (float32x2_t){v805, v806}; + float32x2_t v814 = (float32x2_t){v812, v813}; + float32x2_t v821 = (float32x2_t){v819, v820}; + float32x2_t v828 = (float32x2_t){v826, v827}; + float32x2_t v835 = (float32x2_t){v833, v834}; + float32x2_t v842 = (float32x2_t){v840, v841}; + float32x2_t v849 = (float32x2_t){v847, v848}; + float32x2_t v856 = (float32x2_t){v854, v855}; + float32x2_t v863 = (float32x2_t){v861, v862}; + float32x2_t v870 = (float32x2_t){v868, v869}; + float32x2_t v38 = v5[istride * 18]; + float32x2_t v56 = v7[j * 36]; + int64_t v60 = j * 36 + 1; + int64_t v68 = 34 + j * 36; + float32x2_t v82 = v5[istride * 2]; + float32x2_t v100 = v5[istride * 17]; + int64_t v117 = 32 + j * 36; + int64_t v130 = 2 + j * 36; + float32x2_t v144 = v5[istride * 4]; + float32x2_t v162 = v5[istride * 15]; + int64_t v179 = 6 + j * 36; + int64_t v192 = 28 + j * 36; + float32x2_t v206 = v5[istride * 8]; + float32x2_t v224 = v5[istride * 11]; + int64_t v241 = 20 + j * 36; + int64_t v254 = 14 + j * 36; + float32x2_t v268 = v5[istride * 16]; + float32x2_t v286 = v5[istride * 3]; + int64_t v303 = 30 + j * 36; + int64_t v316 = 4 + j * 36; + float32x2_t v330 = v5[istride * 13]; + float32x2_t v348 = v5[istride * 6]; + int64_t v365 = 10 + j * 36; + int64_t v378 = 24 + j * 36; + float32x2_t v392 = v5[istride * 7]; + float32x2_t v410 = v5[istride * 12]; + int64_t v427 = 12 + j * 36; + int64_t v440 = 22 + j * 36; + float32x2_t v454 = v5[istride * 14]; + float32x2_t v472 = v5[istride * 5]; + int64_t v489 = 8 + j * 36; + int64_t v502 = 26 + j * 36; + float32x2_t v516 = v5[istride * 9]; + float32x2_t v534 = v5[istride * 10]; + int64_t v551 = 16 + j * 36; + int64_t v564 = 18 + j * 36; + float32x2_t v746 = vmul_f32(v871, v744); + 
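+  // v871 is {dir, dir}; multiplying the (-c, +c) constant pairs by it flips their signs for the inverse transform, so one generated kernel serves both directions (dir is presumably +1 or -1).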
float32x2_t v753 = vmul_f32(v871, v751); + float32x2_t v760 = vmul_f32(v871, v758); + float32x2_t v767 = vmul_f32(v871, v765); + float32x2_t v774 = vmul_f32(v871, v772); + float32x2_t v781 = vmul_f32(v871, v779); + float32x2_t v788 = vmul_f32(v871, v786); + float32x2_t v795 = vmul_f32(v871, v793); + float32x2_t v802 = vmul_f32(v871, v800); + float32x2_t v809 = vmul_f32(v871, v807); + float32x2_t v816 = vmul_f32(v871, v814); + float32x2_t v823 = vmul_f32(v871, v821); + float32x2_t v830 = vmul_f32(v871, v828); + float32x2_t v837 = vmul_f32(v871, v835); + float32x2_t v844 = vmul_f32(v871, v842); + float32x2_t v851 = vmul_f32(v871, v849); + float32x2_t v858 = vmul_f32(v871, v856); + float32x2_t v865 = vmul_f32(v871, v863); + float32x2_t v872 = vmul_f32(v871, v870); + float32x2_t v61 = v7[v60]; + float32x2_t v62 = vmul_f32(v57, v56); + float32x2_t v69 = v7[v68]; + float32x2_t v70 = vtrn1_f32(v38, v38); + float32x2_t v71 = vtrn2_f32(v38, v38); + int64_t v73 = v68 + 1; + float32x2_t v118 = v7[v117]; + float32x2_t v119 = vtrn1_f32(v100, v100); + float32x2_t v120 = vtrn2_f32(v100, v100); + int64_t v122 = v117 + 1; + float32x2_t v131 = v7[v130]; + float32x2_t v132 = vtrn1_f32(v82, v82); + float32x2_t v133 = vtrn2_f32(v82, v82); + int64_t v135 = v130 + 1; + float32x2_t v180 = v7[v179]; + float32x2_t v181 = vtrn1_f32(v144, v144); + float32x2_t v182 = vtrn2_f32(v144, v144); + int64_t v184 = v179 + 1; + float32x2_t v193 = v7[v192]; + float32x2_t v194 = vtrn1_f32(v162, v162); + float32x2_t v195 = vtrn2_f32(v162, v162); + int64_t v197 = v192 + 1; + float32x2_t v242 = v7[v241]; + float32x2_t v243 = vtrn1_f32(v224, v224); + float32x2_t v244 = vtrn2_f32(v224, v224); + int64_t v246 = v241 + 1; + float32x2_t v255 = v7[v254]; + float32x2_t v256 = vtrn1_f32(v206, v206); + float32x2_t v257 = vtrn2_f32(v206, v206); + int64_t v259 = v254 + 1; + float32x2_t v304 = v7[v303]; + float32x2_t v305 = vtrn1_f32(v268, v268); + float32x2_t v306 = vtrn2_f32(v268, v268); + int64_t v308 = v303 + 1; + float32x2_t v317 = v7[v316]; + float32x2_t v318 = vtrn1_f32(v286, v286); + float32x2_t v319 = vtrn2_f32(v286, v286); + int64_t v321 = v316 + 1; + float32x2_t v366 = v7[v365]; + float32x2_t v367 = vtrn1_f32(v348, v348); + float32x2_t v368 = vtrn2_f32(v348, v348); + int64_t v370 = v365 + 1; + float32x2_t v379 = v7[v378]; + float32x2_t v380 = vtrn1_f32(v330, v330); + float32x2_t v381 = vtrn2_f32(v330, v330); + int64_t v383 = v378 + 1; + float32x2_t v428 = v7[v427]; + float32x2_t v429 = vtrn1_f32(v392, v392); + float32x2_t v430 = vtrn2_f32(v392, v392); + int64_t v432 = v427 + 1; + float32x2_t v441 = v7[v440]; + float32x2_t v442 = vtrn1_f32(v410, v410); + float32x2_t v443 = vtrn2_f32(v410, v410); + int64_t v445 = v440 + 1; + float32x2_t v490 = v7[v489]; + float32x2_t v491 = vtrn1_f32(v472, v472); + float32x2_t v492 = vtrn2_f32(v472, v472); + int64_t v494 = v489 + 1; + float32x2_t v503 = v7[v502]; + float32x2_t v504 = vtrn1_f32(v454, v454); + float32x2_t v505 = vtrn2_f32(v454, v454); + int64_t v507 = v502 + 1; + float32x2_t v552 = v7[v551]; + float32x2_t v553 = vtrn1_f32(v516, v516); + float32x2_t v554 = vtrn2_f32(v516, v516); + int64_t v556 = v551 + 1; + float32x2_t v565 = v7[v564]; + float32x2_t v566 = vtrn1_f32(v534, v534); + float32x2_t v567 = vtrn2_f32(v534, v534); + int64_t v569 = v564 + 1; + float32x2_t v74 = v7[v73]; + float32x2_t v75 = vmul_f32(v70, v69); + float32x2_t v123 = v7[v122]; + float32x2_t v124 = vmul_f32(v119, v118); + float32x2_t v136 = v7[v135]; + float32x2_t v137 = vmul_f32(v132, v131); + float32x2_t v185 = 
v7[v184]; + float32x2_t v186 = vmul_f32(v181, v180); + float32x2_t v198 = v7[v197]; + float32x2_t v199 = vmul_f32(v194, v193); + float32x2_t v247 = v7[v246]; + float32x2_t v248 = vmul_f32(v243, v242); + float32x2_t v260 = v7[v259]; + float32x2_t v261 = vmul_f32(v256, v255); + float32x2_t v309 = v7[v308]; + float32x2_t v310 = vmul_f32(v305, v304); + float32x2_t v322 = v7[v321]; + float32x2_t v323 = vmul_f32(v318, v317); + float32x2_t v371 = v7[v370]; + float32x2_t v372 = vmul_f32(v367, v366); + float32x2_t v384 = v7[v383]; + float32x2_t v385 = vmul_f32(v380, v379); + float32x2_t v433 = v7[v432]; + float32x2_t v434 = vmul_f32(v429, v428); + float32x2_t v446 = v7[v445]; + float32x2_t v447 = vmul_f32(v442, v441); + float32x2_t v495 = v7[v494]; + float32x2_t v496 = vmul_f32(v491, v490); + float32x2_t v508 = v7[v507]; + float32x2_t v509 = vmul_f32(v504, v503); + float32x2_t v557 = v7[v556]; + float32x2_t v558 = vmul_f32(v553, v552); + float32x2_t v570 = v7[v569]; + float32x2_t v571 = vmul_f32(v566, v565); + float32x2_t v64 = vfma_f32(v62, v58, v61); + float32x2_t v77 = vfma_f32(v75, v71, v74); + float32x2_t v126 = vfma_f32(v124, v120, v123); + float32x2_t v139 = vfma_f32(v137, v133, v136); + float32x2_t v188 = vfma_f32(v186, v182, v185); + float32x2_t v201 = vfma_f32(v199, v195, v198); + float32x2_t v250 = vfma_f32(v248, v244, v247); + float32x2_t v263 = vfma_f32(v261, v257, v260); + float32x2_t v312 = vfma_f32(v310, v306, v309); + float32x2_t v325 = vfma_f32(v323, v319, v322); + float32x2_t v374 = vfma_f32(v372, v368, v371); + float32x2_t v387 = vfma_f32(v385, v381, v384); + float32x2_t v436 = vfma_f32(v434, v430, v433); + float32x2_t v449 = vfma_f32(v447, v443, v446); + float32x2_t v498 = vfma_f32(v496, v492, v495); + float32x2_t v511 = vfma_f32(v509, v505, v508); + float32x2_t v560 = vfma_f32(v558, v554, v557); + float32x2_t v573 = vfma_f32(v571, v567, v570); + float32x2_t v574 = vadd_f32(v64, v77); + float32x2_t v575 = vsub_f32(v64, v77); + float32x2_t v576 = vadd_f32(v139, v126); + float32x2_t v577 = vsub_f32(v126, v139); + float32x2_t v578 = vadd_f32(v188, v201); + float32x2_t v579 = vsub_f32(v188, v201); + float32x2_t v580 = vadd_f32(v263, v250); + float32x2_t v581 = vsub_f32(v250, v263); + float32x2_t v582 = vadd_f32(v312, v325); + float32x2_t v583 = vsub_f32(v312, v325); + float32x2_t v584 = vadd_f32(v387, v374); + float32x2_t v585 = vsub_f32(v374, v387); + float32x2_t v586 = vadd_f32(v436, v449); + float32x2_t v587 = vsub_f32(v436, v449); + float32x2_t v588 = vadd_f32(v511, v498); + float32x2_t v589 = vsub_f32(v498, v511); + float32x2_t v590 = vadd_f32(v560, v573); + float32x2_t v591 = vsub_f32(v560, v573); + float32x2_t v592 = vsub_f32(v574, v586); + float32x2_t v593 = vsub_f32(v576, v588); + float32x2_t v594 = vsub_f32(v578, v590); + float32x2_t v595 = vsub_f32(v580, v586); + float32x2_t v596 = vsub_f32(v582, v588); + float32x2_t v597 = vsub_f32(v584, v590); + float32x2_t v598 = vadd_f32(v574, v580); + float32x2_t v600 = vadd_f32(v576, v582); + float32x2_t v602 = vadd_f32(v578, v584); + float32x2_t v630 = vsub_f32(v575, v587); + float32x2_t v631 = vsub_f32(v577, v589); + float32x2_t v632 = vsub_f32(v579, v591); + float32x2_t v633 = vsub_f32(v581, v587); + float32x2_t v634 = vsub_f32(v583, v589); + float32x2_t v635 = vsub_f32(v585, v591); + float32x2_t v636 = vadd_f32(v575, v581); + float32x2_t v638 = vadd_f32(v577, v583); + float32x2_t v640 = vadd_f32(v579, v585); + float32x2_t v599 = vadd_f32(v598, v586); + float32x2_t v601 = vadd_f32(v600, v588); + float32x2_t v603 = vadd_f32(v602, 
v590); + float32x2_t v604 = vadd_f32(v592, v594); + float32x2_t v605 = vadd_f32(v595, v597); + float32x2_t v620 = vsub_f32(v592, v595); + float32x2_t v621 = vsub_f32(v594, v597); + float32x2_t v637 = vadd_f32(v636, v587); + float32x2_t v639 = vadd_f32(v638, v589); + float32x2_t v641 = vadd_f32(v640, v591); + float32x2_t v642 = vadd_f32(v630, v632); + float32x2_t v643 = vadd_f32(v633, v635); + float32x2_t v652 = vsub_f32(v630, v633); + float32x2_t v653 = vsub_f32(v632, v635); + float32x2_t v697 = vmul_f32(v595, v696); + float32x2_t v709 = vmul_f32(v597, v708); + float32x2_t v717 = vmul_f32(v594, v716); + float32x2_t v796 = vrev64_f32(v633); + float32x2_t v810 = vrev64_f32(v630); + float32x2_t v817 = vrev64_f32(v635); + float32x2_t v831 = vrev64_f32(v632); + float32x2_t v606 = vadd_f32(v599, v601); + float32x2_t v614 = vadd_f32(v605, v596); + float32x2_t v615 = vadd_f32(v604, v593); + float32x2_t v617 = vsub_f32(v605, v596); + float32x2_t v618 = vsub_f32(v604, v593); + float32x2_t v622 = vsub_f32(v592, v621); + float32x2_t v624 = vadd_f32(v620, v597); + float32x2_t v627 = vsub_f32(v599, v603); + float32x2_t v628 = vsub_f32(v601, v603); + float32x2_t v644 = vadd_f32(v637, v639); + float32x2_t v646 = vadd_f32(v643, v634); + float32x2_t v647 = vadd_f32(v642, v631); + float32x2_t v649 = vsub_f32(v643, v634); + float32x2_t v650 = vsub_f32(v642, v631); + float32x2_t v654 = vsub_f32(v630, v653); + float32x2_t v656 = vadd_f32(v652, v635); + float32x2_t v659 = vsub_f32(v637, v641); + float32x2_t v660 = vsub_f32(v639, v641); + float32x2_t v701 = vmul_f32(v620, v700); + float32x2_t v713 = vmul_f32(v621, v712); + float32x2_t v797 = vmul_f32(v796, v795); + float32x2_t v803 = vrev64_f32(v652); + float32x2_t v818 = vmul_f32(v817, v816); + float32x2_t v824 = vrev64_f32(v653); + float32x2_t v832 = vmul_f32(v831, v830); + float32x2_t v607 = vadd_f32(v606, v603); + float32x2_t v616 = vsub_f32(v615, v614); + float32x2_t v619 = vsub_f32(v618, v617); + float32x2_t v623 = vsub_f32(v622, v596); + float32x2_t v625 = vsub_f32(v624, v593); + float32x2_t v629 = vadd_f32(v627, v628); + float32x2_t v645 = vadd_f32(v644, v641); + float32x2_t v648 = vsub_f32(v647, v646); + float32x2_t v651 = vsub_f32(v650, v649); + float32x2_t v655 = vsub_f32(v654, v634); + float32x2_t v657 = vsub_f32(v656, v631); + float32x2_t v661 = vadd_f32(v659, v660); + float32x2_t v673 = vmul_f32(v614, v672); + float32x2_t v677 = vmul_f32(v615, v676); + float32x2_t v685 = vmul_f32(v617, v684); + float32x2_t v689 = vmul_f32(v618, v688); + float32x2_t v733 = vmul_f32(v627, v732); + float32x2_t v737 = vmul_f32(v628, v736); + float32x2_t v754 = vrev64_f32(v646); + float32x2_t v761 = vrev64_f32(v647); + float32x2_t v775 = vrev64_f32(v649); + float32x2_t v782 = vrev64_f32(v650); + float32x2_t v804 = vmul_f32(v803, v802); + float32x2_t v825 = vmul_f32(v824, v823); + float32x2_t v859 = vrev64_f32(v659); + float32x2_t v866 = vrev64_f32(v660); + float32x2_t v613 = vadd_f32(v612, v607); + float32x2_t v626 = vsub_f32(v623, v625); + float32x2_t v658 = vsub_f32(v655, v657); + float32x2_t v669 = vmul_f32(v607, v668); + float32x2_t v681 = vmul_f32(v616, v680); + float32x2_t v693 = vmul_f32(v619, v692); + float32x2_t v721 = vmul_f32(v623, v720); + float32x2_t v725 = vmul_f32(v625, v724); + float32x2_t v741 = vmul_f32(v629, v740); + float32x2_t v747 = vrev64_f32(v645); + float32x2_t v755 = vmul_f32(v754, v753); + float32x2_t v762 = vmul_f32(v761, v760); + float32x2_t v768 = vrev64_f32(v648); + float32x2_t v776 = vmul_f32(v775, v774); + float32x2_t v783 = 
vmul_f32(v782, v781); + float32x2_t v789 = vrev64_f32(v651); + float32x2_t v838 = vrev64_f32(v655); + float32x2_t v845 = vrev64_f32(v657); + float32x2_t v860 = vmul_f32(v859, v858); + float32x2_t v867 = vmul_f32(v866, v865); + float32x2_t v873 = vrev64_f32(v661); + float32x2_t v875 = vadd_f32(v673, v677); + float32x2_t v876 = vadd_f32(v685, v689); + float32x2_t v729 = vmul_f32(v626, v728); + float32x2_t v748 = vmul_f32(v747, v746); + float32x2_t v769 = vmul_f32(v768, v767); + float32x2_t v790 = vmul_f32(v789, v788); + float32x2_t v839 = vmul_f32(v838, v837); + float32x2_t v846 = vmul_f32(v845, v844); + float32x2_t v852 = vrev64_f32(v658); + float32x2_t v874 = vmul_f32(v873, v872); + float32x2_t v878 = vadd_f32(v875, v876); + float32x2_t v879 = vadd_f32(v673, v681); + float32x2_t v880 = vadd_f32(v685, v693); + float32x2_t v897 = vsub_f32(v875, v876); + float32x2_t v899 = vsub_f32(v733, v741); + float32x2_t v900 = vsub_f32(v737, v741); + float32x2_t v901 = vadd_f32(v669, v613); + float32x2_t v906 = vadd_f32(v755, v762); + float32x2_t v907 = vadd_f32(v776, v783); + int16x4_t v962 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v613, 15), (int32x2_t){0, 0})); + float32x2_t v853 = vmul_f32(v852, v851); + float32x2_t v877 = vadd_f32(v725, v729); + float32x2_t v881 = vadd_f32(v721, v729); + float32x2_t v882 = vsub_f32(v697, v878); + float32x2_t v883 = vadd_f32(v879, v880); + float32x2_t v889 = vsub_f32(v879, v880); + float32x2_t v894 = vadd_f32(v878, v717); + float32x2_t v902 = vadd_f32(v901, v899); + float32x2_t v903 = vsub_f32(v901, v899); + float32x2_t v905 = vadd_f32(v901, v900); + float32x2_t v909 = vadd_f32(v906, v907); + float32x2_t v910 = vadd_f32(v755, v769); + float32x2_t v911 = vadd_f32(v776, v790); + float32x2_t v928 = vsub_f32(v906, v907); + float32x2_t v930 = vsub_f32(v860, v874); + float32x2_t v931 = vsub_f32(v867, v874); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v962), 0); + float32x2_t v884 = vsub_f32(v709, v881); + float32x2_t v885 = vadd_f32(v701, v877); + float32x2_t v887 = vadd_f32(v883, v713); + float32x2_t v890 = vadd_f32(v889, v877); + float32x2_t v891 = vadd_f32(v882, v883); + float32x2_t v898 = vadd_f32(v897, v881); + float32x2_t v904 = vsub_f32(v903, v900); + float32x2_t v908 = vadd_f32(v846, v853); + float32x2_t v912 = vadd_f32(v839, v853); + float32x2_t v913 = vsub_f32(v797, v909); + float32x2_t v914 = vadd_f32(v910, v911); + float32x2_t v920 = vsub_f32(v910, v911); + float32x2_t v925 = vadd_f32(v909, v832); + float32x2_t v932 = vadd_f32(v748, v930); + float32x2_t v933 = vsub_f32(v748, v930); + float32x2_t v935 = vadd_f32(v748, v931); + float32x2_t v886 = vadd_f32(v885, v882); + float32x2_t v888 = vadd_f32(v887, v884); + float32x2_t v892 = vfma_f32(v891, v592, v704); + float32x2_t v895 = vadd_f32(v894, v884); + float32x2_t v915 = vsub_f32(v818, v912); + float32x2_t v916 = vadd_f32(v804, v908); + float32x2_t v918 = vadd_f32(v914, v825); + float32x2_t v921 = vadd_f32(v920, v908); + float32x2_t v922 = vadd_f32(v913, v914); + float32x2_t v929 = vadd_f32(v928, v912); + float32x2_t v934 = vsub_f32(v933, v931); + float32x2_t v940 = vsub_f32(v898, v890); + float32x2_t v944 = vsub_f32(v905, v898); + float32x2_t v947 = vadd_f32(v890, v905); + float32x2_t v893 = vadd_f32(v892, v881); + float32x2_t v896 = vadd_f32(v895, v877); + float32x2_t v917 = vadd_f32(v916, v913); + float32x2_t v919 = vadd_f32(v918, v915); + float32x2_t v923 = vfma_f32(v922, v810, v809); + float32x2_t v926 = vadd_f32(v925, v915); + float32x2_t v941 = vadd_f32(v940, v905); + float32x2_t v945 = vadd_f32(v886, 
v902); + float32x2_t v946 = vadd_f32(v888, v904); + float32x2_t v952 = vsub_f32(v929, v921); + float32x2_t v956 = vsub_f32(v929, v935); + float32x2_t v959 = vadd_f32(v921, v935); + float32x2_t v924 = vadd_f32(v923, v912); + float32x2_t v927 = vadd_f32(v926, v908); + float32x2_t v936 = vsub_f32(v893, v886); + float32x2_t v938 = vsub_f32(v896, v888); + float32x2_t v942 = vsub_f32(v902, v893); + float32x2_t v943 = vsub_f32(v904, v896); + float32x2_t v953 = vadd_f32(v952, v935); + float32x2_t v957 = vadd_f32(v917, v932); + float32x2_t v958 = vadd_f32(v919, v934); + float32x2_t v980 = vsub_f32(v947, v959); + float32x2_t v987 = vadd_f32(v947, v959); + float32x2_t v994 = vadd_f32(v944, v956); + float32x2_t v1001 = vsub_f32(v944, v956); + float32x2_t v937 = vadd_f32(v936, v902); + float32x2_t v939 = vadd_f32(v938, v904); + float32x2_t v948 = vsub_f32(v924, v917); + float32x2_t v950 = vsub_f32(v927, v919); + float32x2_t v954 = vsub_f32(v932, v924); + float32x2_t v955 = vsub_f32(v934, v927); + int16x4_t v983 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v980, 15), (int32x2_t){0, 0})); + int16x4_t v990 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v987, 15), (int32x2_t){0, 0})); + int16x4_t v997 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v994, 15), (int32x2_t){0, 0})); + int16x4_t v1004 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1001, 15), (int32x2_t){0, 0})); + float32x2_t v1008 = vadd_f32(v946, v958); + float32x2_t v1015 = vsub_f32(v946, v958); + float32x2_t v1022 = vadd_f32(v941, v953); + float32x2_t v1029 = vsub_f32(v941, v953); + float32x2_t v1064 = vsub_f32(v945, v957); + float32x2_t v1071 = vadd_f32(v945, v957); + float32x2_t v949 = vadd_f32(v948, v932); + float32x2_t v951 = vadd_f32(v950, v934); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v983), 0); + v6[ostride * 17] = vget_lane_s32(vreinterpret_s32_s16(v990), 0); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v997), 0); + v6[ostride * 16] = vget_lane_s32(vreinterpret_s32_s16(v1004), 0); + int16x4_t v1011 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1008, 15), (int32x2_t){0, 0})); + int16x4_t v1018 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1015, 15), (int32x2_t){0, 0})); + int16x4_t v1025 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1022, 15), (int32x2_t){0, 0})); + int16x4_t v1032 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1029, 15), (int32x2_t){0, 0})); + float32x2_t v1036 = vadd_f32(v943, v955); + float32x2_t v1043 = vsub_f32(v943, v955); + float32x2_t v1050 = vadd_f32(v942, v954); + float32x2_t v1057 = vsub_f32(v942, v954); + int16x4_t v1067 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1064, 15), (int32x2_t){0, 0})); + int16x4_t v1074 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1071, 15), (int32x2_t){0, 0})); + float32x2_t v966 = vadd_f32(v937, v949); + float32x2_t v973 = vsub_f32(v937, v949); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v1011), 0); + v6[ostride * 15] = vget_lane_s32(vreinterpret_s32_s16(v1018), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v1025), 0); + v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v1032), 0); + int16x4_t v1039 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1036, 15), (int32x2_t){0, 0})); + int16x4_t v1046 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1043, 15), (int32x2_t){0, 0})); + int16x4_t v1053 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1050, 15), (int32x2_t){0, 0})); + int16x4_t v1060 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1057, 15), (int32x2_t){0, 0})); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v1067), 0); + v6[ostride * 11] = 
vget_lane_s32(vreinterpret_s32_s16(v1074), 0); + float32x2_t v1078 = vadd_f32(v939, v951); + float32x2_t v1085 = vsub_f32(v939, v951); + int16x4_t v969 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v966, 15), (int32x2_t){0, 0})); + int16x4_t v976 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v973, 15), (int32x2_t){0, 0})); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v1039), 0); + v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v1046), 0); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v1053), 0); + v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v1060), 0); + int16x4_t v1081 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1078, 15), (int32x2_t){0, 0})); + int16x4_t v1088 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1085, 15), (int32x2_t){0, 0})); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v969), 0); + v6[ostride * 18] = vget_lane_s32(vreinterpret_s32_s16(v976), 0); + v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v1081), 0); + v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v1088), 0); + v5 += 1 * idist; + v6 += 1 * odist; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cs16_ab_t_gs19(const armral_cmplx_f32_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, + const armral_cmplx_f32_t *restrict w, + int howmany, int idist, int odist, + float dir) { + int64_t v0 = istride; + int64_t v1 = idist; + int64_t v2 = ostride; + int64_t v3 = odist; + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + int32_t *v6 = (int32_t *)y; + const float32x2_t *v7 = (const float32x2_t *)w; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * v1; + int64_t v12 = v10 * v3; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + int64_t v13 = j; + float v492 = -1.0555555555555556e+00F; + float v497 = 1.7752228513927079e-01F; + float v502 = -1.2820077502191529e-01F; + float v507 = 4.9321510117355499e-02F; + float v512 = 5.7611011491005903e-01F; + float v517 = -7.4996449655536279e-01F; + float v522 = -1.7385438164530381e-01F; + float v527 = -2.1729997561977314e+00F; + float v532 = -1.7021211726914738e+00F; + float v537 = 4.7087858350625778e-01F; + float v542 = -2.0239400846888440e+00F; + float v547 = 1.0551641201664090e-01F; + float v552 = 2.1294564967054850e+00F; + float v557 = -7.5087543897371167e-01F; + float v562 = 1.4812817695157160e-01F; + float v567 = 8.9900361592528333e-01F; + float v572 = -6.2148246772602778e-01F; + float v577 = -7.9869352098712687e-01F; + float v582 = -4.7339199623771833e-01F; + float v587 = 2.4216105241892630e-01F; + float v594 = 5.9368607967505101e-02F; + float v601 = -1.2578688255176201e-02F; + float v608 = 4.6789919712328903e-02F; + float v615 = 9.3750121913782358e-01F; + float v622 = 5.0111537043352902e-02F; + float v629 = 9.8761275618117661e-01F; + float v636 = 1.1745786501205959e+00F; + float v643 = -1.1114482296234993e+00F; + float v650 = -2.2860268797440955e+00F; + float v657 = -2.6420523257930939e-01F; + float v664 = -2.1981792779352136e+00F; + float v671 = -1.9339740453559042e+00F; + float v678 = 7.4825847091254893e-01F; + float v685 = 4.7820835642768872e-01F; + float v692 = -2.7005011448486022e-01F; + float v699 = 3.4642356159542270e-01F; + float v706 = 8.3485429360688279e-01F; + float v713 = 3.9375928506743518e-01F; + const float32x2_t *v980 = &v5[v0]; + int32_t *v1203 = &v6[v2]; + int64_t v33 = v0 * 18; + int64_t v55 = v10 * 17; + int64_t v61 = v0 * 2; + int64_t v75 = v0 * 17; + int64_t v90 = v10 * 16; + int64_t 
v103 = v0 * 4; + int64_t v117 = v0 * 15; + int64_t v132 = v10 * 3; + int64_t v139 = v10 * 14; + int64_t v145 = v0 * 8; + int64_t v159 = v0 * 11; + int64_t v174 = v10 * 10; + int64_t v181 = v10 * 7; + int64_t v187 = v0 * 16; + int64_t v201 = v0 * 3; + int64_t v216 = v10 * 15; + int64_t v223 = v10 * 2; + int64_t v229 = v0 * 13; + int64_t v243 = v0 * 6; + int64_t v258 = v10 * 5; + int64_t v265 = v10 * 12; + int64_t v271 = v0 * 7; + int64_t v285 = v0 * 12; + int64_t v300 = v10 * 6; + int64_t v307 = v10 * 11; + int64_t v313 = v0 * 14; + int64_t v327 = v0 * 5; + int64_t v342 = v10 * 4; + int64_t v349 = v10 * 13; + int64_t v355 = v0 * 9; + int64_t v369 = v0 * 10; + int64_t v384 = v10 * 8; + int64_t v391 = v10 * 9; + int64_t v392 = v13 * 18; + float v590 = v4 * v587; + float v597 = v4 * v594; + float v604 = v4 * v601; + float v611 = v4 * v608; + float v618 = v4 * v615; + float v625 = v4 * v622; + float v632 = v4 * v629; + float v639 = v4 * v636; + float v646 = v4 * v643; + float v653 = v4 * v650; + float v660 = v4 * v657; + float v667 = v4 * v664; + float v674 = v4 * v671; + float v681 = v4 * v678; + float v688 = v4 * v685; + float v695 = v4 * v692; + float v702 = v4 * v699; + float v709 = v4 * v706; + float v716 = v4 * v713; + int64_t v823 = v2 * 18; + int64_t v832 = v2 * 2; + int64_t v841 = v2 * 17; + int64_t v850 = v2 * 3; + int64_t v859 = v2 * 16; + int64_t v868 = v2 * 4; + int64_t v877 = v2 * 15; + int64_t v886 = v2 * 5; + int64_t v895 = v2 * 14; + int64_t v904 = v2 * 6; + int64_t v913 = v2 * 13; + int64_t v922 = v2 * 7; + int64_t v931 = v2 * 12; + int64_t v940 = v2 * 8; + int64_t v949 = v2 * 11; + int64_t v958 = v2 * 9; + int64_t v967 = v2 * 10; + const float32x2_t *v1145 = &v5[0]; + svint64_t v1146 = svindex_s64(0, v1); + svfloat32_t v1149 = svdup_n_f32(v492); + svfloat32_t v1150 = svdup_n_f32(v497); + svfloat32_t v1151 = svdup_n_f32(v502); + svfloat32_t v1152 = svdup_n_f32(v507); + svfloat32_t v1153 = svdup_n_f32(v512); + svfloat32_t v1154 = svdup_n_f32(v517); + svfloat32_t v1155 = svdup_n_f32(v522); + svfloat32_t v1156 = svdup_n_f32(v527); + svfloat32_t v1157 = svdup_n_f32(v532); + svfloat32_t v1158 = svdup_n_f32(v537); + svfloat32_t v1159 = svdup_n_f32(v542); + svfloat32_t v1160 = svdup_n_f32(v547); + svfloat32_t v1161 = svdup_n_f32(v552); + svfloat32_t v1162 = svdup_n_f32(v557); + svfloat32_t v1163 = svdup_n_f32(v562); + svfloat32_t v1164 = svdup_n_f32(v567); + svfloat32_t v1165 = svdup_n_f32(v572); + svfloat32_t v1166 = svdup_n_f32(v577); + svfloat32_t v1167 = svdup_n_f32(v582); + int32_t *v1194 = &v6[0]; + svint64_t v1357 = svindex_s64(0, v3); + svfloat32_t v51 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v392])); + int64_t v57 = v55 + v392; + int64_t v92 = v90 + v392; + int64_t v99 = v10 + v392; + int64_t v134 = v132 + v392; + int64_t v141 = v139 + v392; + int64_t v176 = v174 + v392; + int64_t v183 = v181 + v392; + int64_t v218 = v216 + v392; + int64_t v225 = v223 + v392; + int64_t v260 = v258 + v392; + int64_t v267 = v265 + v392; + int64_t v302 = v300 + v392; + int64_t v309 = v307 + v392; + int64_t v344 = v342 + v392; + int64_t v351 = v349 + v392; + int64_t v386 = v384 + v392; + int64_t v393 = v391 + v392; + svfloat32_t v982 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v980), v1146)); + const float32x2_t *v990 = &v5[v33]; + const float32x2_t *v1000 = &v5[v61]; + const float32x2_t *v1009 = &v5[v75]; + const float32x2_t *v1018 = &v5[v103]; + const float32x2_t *v1027 = &v5[v117]; + const float32x2_t *v1036 = &v5[v145]; + 
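+  // Base pointers for the remaining istride-separated inputs of this length-19 transform; each is gather-loaded further down as one 64-bit complex value per transform in the batch.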
const float32x2_t *v1045 = &v5[v159]; + const float32x2_t *v1054 = &v5[v187]; + const float32x2_t *v1063 = &v5[v201]; + const float32x2_t *v1072 = &v5[v229]; + const float32x2_t *v1081 = &v5[v243]; + const float32x2_t *v1090 = &v5[v271]; + const float32x2_t *v1099 = &v5[v285]; + const float32x2_t *v1108 = &v5[v313]; + const float32x2_t *v1117 = &v5[v327]; + const float32x2_t *v1126 = &v5[v355]; + const float32x2_t *v1135 = &v5[v369]; + svfloat32_t v1147 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1145), v1146)); + svfloat32_t v1168 = svdup_n_f32(v590); + svfloat32_t v1169 = svdup_n_f32(v597); + svfloat32_t v1170 = svdup_n_f32(v604); + svfloat32_t v1171 = svdup_n_f32(v611); + svfloat32_t v1172 = svdup_n_f32(v618); + svfloat32_t v1173 = svdup_n_f32(v625); + svfloat32_t v1174 = svdup_n_f32(v632); + svfloat32_t v1175 = svdup_n_f32(v639); + svfloat32_t v1176 = svdup_n_f32(v646); + svfloat32_t v1177 = svdup_n_f32(v653); + svfloat32_t v1178 = svdup_n_f32(v660); + svfloat32_t v1179 = svdup_n_f32(v667); + svfloat32_t v1180 = svdup_n_f32(v674); + svfloat32_t v1181 = svdup_n_f32(v681); + svfloat32_t v1182 = svdup_n_f32(v688); + svfloat32_t v1183 = svdup_n_f32(v695); + svfloat32_t v1184 = svdup_n_f32(v702); + svfloat32_t v1185 = svdup_n_f32(v709); + svfloat32_t v1186 = svdup_n_f32(v716); + int32_t *v1212 = &v6[v823]; + int32_t *v1221 = &v6[v832]; + int32_t *v1230 = &v6[v841]; + int32_t *v1239 = &v6[v850]; + int32_t *v1248 = &v6[v859]; + int32_t *v1257 = &v6[v868]; + int32_t *v1266 = &v6[v877]; + int32_t *v1275 = &v6[v886]; + int32_t *v1284 = &v6[v895]; + int32_t *v1293 = &v6[v904]; + int32_t *v1302 = &v6[v913]; + int32_t *v1311 = &v6[v922]; + int32_t *v1320 = &v6[v931]; + int32_t *v1329 = &v6[v940]; + int32_t *v1338 = &v6[v949]; + int32_t *v1347 = &v6[v958]; + int32_t *v1356 = &v6[v967]; + svfloat32_t zero52 = svdup_n_f32(0); + svfloat32_t v52 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero52, v982, v51, 0), + v982, v51, 90); + svfloat32_t v58 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v57])); + svfloat32_t v93 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v92])); + svfloat32_t v100 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v99])); + svfloat32_t v135 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v134])); + svfloat32_t v142 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v141])); + svfloat32_t v177 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v176])); + svfloat32_t v184 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v183])); + svfloat32_t v219 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v218])); + svfloat32_t v226 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v225])); + svfloat32_t v261 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v260])); + svfloat32_t v268 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v267])); + svfloat32_t v303 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v302])); + svfloat32_t v310 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v309])); + svfloat32_t v345 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v344])); + svfloat32_t v352 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v351])); + svfloat32_t v387 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v386])); 
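+  // Twiddle factors are loaded from w as 64-bit (re, im) elements and reinterpreted as f32 vectors, one complex twiddle per 64-bit lane.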
+ svfloat32_t v394 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v393])); + svfloat32_t v992 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v990), v1146)); + svfloat32_t v1002 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1000), v1146)); + svfloat32_t v1011 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1009), v1146)); + svfloat32_t v1020 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1018), v1146)); + svfloat32_t v1029 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1027), v1146)); + svfloat32_t v1038 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1036), v1146)); + svfloat32_t v1047 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1045), v1146)); + svfloat32_t v1056 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1054), v1146)); + svfloat32_t v1065 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1063), v1146)); + svfloat32_t v1074 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1072), v1146)); + svfloat32_t v1083 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1081), v1146)); + svfloat32_t v1092 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1090), v1146)); + svfloat32_t v1101 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1099), v1146)); + svfloat32_t v1110 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1108), v1146)); + svfloat32_t v1119 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1117), v1146)); + svfloat32_t v1128 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1126), v1146)); + svfloat32_t v1137 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1135), v1146)); + svfloat32_t zero59 = svdup_n_f32(0); + svfloat32_t v59 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero59, v992, v58, 0), + v992, v58, 90); + svfloat32_t zero94 = svdup_n_f32(0); + svfloat32_t v94 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero94, v1011, v93, 0), + v1011, v93, 90); + svfloat32_t zero101 = svdup_n_f32(0); + svfloat32_t v101 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero101, v1002, v100, 0), v1002, + v100, 90); + svfloat32_t zero136 = svdup_n_f32(0); + svfloat32_t v136 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero136, v1020, v135, 0), v1020, + v135, 90); + svfloat32_t zero143 = svdup_n_f32(0); + svfloat32_t v143 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero143, v1029, v142, 0), v1029, + v142, 90); + svfloat32_t zero178 = svdup_n_f32(0); + svfloat32_t v178 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero178, v1047, v177, 0), v1047, + v177, 90); + svfloat32_t zero185 = svdup_n_f32(0); + svfloat32_t v185 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero185, v1038, v184, 0), v1038, + v184, 90); + svfloat32_t zero220 = svdup_n_f32(0); + svfloat32_t v220 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero220, v1056, v219, 0), v1056, + v219, 90); + svfloat32_t zero227 = svdup_n_f32(0); + svfloat32_t v227 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero227, v1065, v226, 0), v1065, + v226, 90); + svfloat32_t zero262 = svdup_n_f32(0); + 
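+  // Each input vector is multiplied by its twiddle using two chained predicated svcmla operations (rotations 0 and 90 degrees) on a zero accumulator, which together form the full complex product.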
svfloat32_t v262 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero262, v1083, v261, 0), v1083, + v261, 90); + svfloat32_t zero269 = svdup_n_f32(0); + svfloat32_t v269 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero269, v1074, v268, 0), v1074, + v268, 90); + svfloat32_t zero304 = svdup_n_f32(0); + svfloat32_t v304 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero304, v1092, v303, 0), v1092, + v303, 90); + svfloat32_t zero311 = svdup_n_f32(0); + svfloat32_t v311 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero311, v1101, v310, 0), v1101, + v310, 90); + svfloat32_t zero346 = svdup_n_f32(0); + svfloat32_t v346 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero346, v1119, v345, 0), v1119, + v345, 90); + svfloat32_t zero353 = svdup_n_f32(0); + svfloat32_t v353 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero353, v1110, v352, 0), v1110, + v352, 90); + svfloat32_t zero388 = svdup_n_f32(0); + svfloat32_t v388 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero388, v1128, v387, 0), v1128, + v387, 90); + svfloat32_t zero395 = svdup_n_f32(0); + svfloat32_t v395 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero395, v1137, v394, 0), v1137, + v394, 90); + svfloat32_t v396 = svadd_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v397 = svsub_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v398 = svadd_f32_x(svptrue_b32(), v101, v94); + svfloat32_t v399 = svsub_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v400 = svadd_f32_x(svptrue_b32(), v136, v143); + svfloat32_t v401 = svsub_f32_x(svptrue_b32(), v136, v143); + svfloat32_t v402 = svadd_f32_x(svptrue_b32(), v185, v178); + svfloat32_t v403 = svsub_f32_x(svptrue_b32(), v178, v185); + svfloat32_t v404 = svadd_f32_x(svptrue_b32(), v220, v227); + svfloat32_t v405 = svsub_f32_x(svptrue_b32(), v220, v227); + svfloat32_t v406 = svadd_f32_x(svptrue_b32(), v269, v262); + svfloat32_t v407 = svsub_f32_x(svptrue_b32(), v262, v269); + svfloat32_t v408 = svadd_f32_x(svptrue_b32(), v304, v311); + svfloat32_t v409 = svsub_f32_x(svptrue_b32(), v304, v311); + svfloat32_t v410 = svadd_f32_x(svptrue_b32(), v353, v346); + svfloat32_t v411 = svsub_f32_x(svptrue_b32(), v346, v353); + svfloat32_t v412 = svadd_f32_x(svptrue_b32(), v388, v395); + svfloat32_t v413 = svsub_f32_x(svptrue_b32(), v388, v395); + svfloat32_t v414 = svsub_f32_x(svptrue_b32(), v396, v408); + svfloat32_t v415 = svsub_f32_x(svptrue_b32(), v398, v410); + svfloat32_t v416 = svsub_f32_x(svptrue_b32(), v400, v412); + svfloat32_t v417 = svsub_f32_x(svptrue_b32(), v402, v408); + svfloat32_t v418 = svsub_f32_x(svptrue_b32(), v404, v410); + svfloat32_t v419 = svsub_f32_x(svptrue_b32(), v406, v412); + svfloat32_t v420 = svadd_f32_x(svptrue_b32(), v396, v402); + svfloat32_t v422 = svadd_f32_x(svptrue_b32(), v398, v404); + svfloat32_t v424 = svadd_f32_x(svptrue_b32(), v400, v406); + svfloat32_t v454 = svsub_f32_x(svptrue_b32(), v397, v409); + svfloat32_t v455 = svsub_f32_x(svptrue_b32(), v399, v411); + svfloat32_t v456 = svsub_f32_x(svptrue_b32(), v401, v413); + svfloat32_t v457 = svsub_f32_x(svptrue_b32(), v403, v409); + svfloat32_t v458 = svsub_f32_x(svptrue_b32(), v405, v411); + svfloat32_t v459 = svsub_f32_x(svptrue_b32(), v407, v413); + svfloat32_t v460 = svadd_f32_x(svptrue_b32(), v397, v403); + svfloat32_t v462 = svadd_f32_x(svptrue_b32(), v399, v405); + svfloat32_t v464 = svadd_f32_x(svptrue_b32(), v401, v407); + svfloat32_t v421 = svadd_f32_x(svptrue_b32(), v420, v408); + svfloat32_t v423 = svadd_f32_x(svptrue_b32(), v422, v410); + svfloat32_t v425 = 
svadd_f32_x(svptrue_b32(), v424, v412); + svfloat32_t v426 = svadd_f32_x(svptrue_b32(), v414, v416); + svfloat32_t v427 = svadd_f32_x(svptrue_b32(), v417, v419); + svfloat32_t v444 = svsub_f32_x(svptrue_b32(), v414, v417); + svfloat32_t v445 = svsub_f32_x(svptrue_b32(), v416, v419); + svfloat32_t v461 = svadd_f32_x(svptrue_b32(), v460, v409); + svfloat32_t v463 = svadd_f32_x(svptrue_b32(), v462, v411); + svfloat32_t v465 = svadd_f32_x(svptrue_b32(), v464, v413); + svfloat32_t v466 = svadd_f32_x(svptrue_b32(), v454, v456); + svfloat32_t v467 = svadd_f32_x(svptrue_b32(), v457, v459); + svfloat32_t v476 = svsub_f32_x(svptrue_b32(), v454, v457); + svfloat32_t v477 = svsub_f32_x(svptrue_b32(), v456, v459); + svfloat32_t zero641 = svdup_n_f32(0); + svfloat32_t v641 = svcmla_f32_x(pred_full, zero641, v1175, v457, 90); + svfloat32_t zero662 = svdup_n_f32(0); + svfloat32_t v662 = svcmla_f32_x(pred_full, zero662, v1178, v459, 90); + svfloat32_t v428 = svadd_f32_x(svptrue_b32(), v421, v423); + svfloat32_t v438 = svadd_f32_x(svptrue_b32(), v427, v418); + svfloat32_t v439 = svadd_f32_x(svptrue_b32(), v426, v415); + svfloat32_t v441 = svsub_f32_x(svptrue_b32(), v427, v418); + svfloat32_t v442 = svsub_f32_x(svptrue_b32(), v426, v415); + svfloat32_t v446 = svsub_f32_x(svptrue_b32(), v414, v445); + svfloat32_t v448 = svadd_f32_x(svptrue_b32(), v444, v419); + svfloat32_t v451 = svsub_f32_x(svptrue_b32(), v421, v425); + svfloat32_t v452 = svsub_f32_x(svptrue_b32(), v423, v425); + svfloat32_t v468 = svadd_f32_x(svptrue_b32(), v461, v463); + svfloat32_t v470 = svadd_f32_x(svptrue_b32(), v467, v458); + svfloat32_t v471 = svadd_f32_x(svptrue_b32(), v466, v455); + svfloat32_t v473 = svsub_f32_x(svptrue_b32(), v467, v458); + svfloat32_t v474 = svsub_f32_x(svptrue_b32(), v466, v455); + svfloat32_t v478 = svsub_f32_x(svptrue_b32(), v454, v477); + svfloat32_t v480 = svadd_f32_x(svptrue_b32(), v476, v459); + svfloat32_t v483 = svsub_f32_x(svptrue_b32(), v461, v465); + svfloat32_t v484 = svsub_f32_x(svptrue_b32(), v463, v465); + svfloat32_t v429 = svadd_f32_x(svptrue_b32(), v428, v425); + svfloat32_t v440 = svsub_f32_x(svptrue_b32(), v439, v438); + svfloat32_t v443 = svsub_f32_x(svptrue_b32(), v442, v441); + svfloat32_t v447 = svsub_f32_x(svptrue_b32(), v446, v418); + svfloat32_t v449 = svsub_f32_x(svptrue_b32(), v448, v415); + svfloat32_t v453 = svadd_f32_x(svptrue_b32(), v451, v452); + svfloat32_t v469 = svadd_f32_x(svptrue_b32(), v468, v465); + svfloat32_t v472 = svsub_f32_x(svptrue_b32(), v471, v470); + svfloat32_t v475 = svsub_f32_x(svptrue_b32(), v474, v473); + svfloat32_t v479 = svsub_f32_x(svptrue_b32(), v478, v458); + svfloat32_t v481 = svsub_f32_x(svptrue_b32(), v480, v455); + svfloat32_t v485 = svadd_f32_x(svptrue_b32(), v483, v484); + svfloat32_t v505 = svmul_f32_x(svptrue_b32(), v439, v1151); + svfloat32_t v520 = svmul_f32_x(svptrue_b32(), v442, v1154); + svfloat32_t zero599 = svdup_n_f32(0); + svfloat32_t v599 = svcmla_f32_x(pred_full, zero599, v1169, v470, 90); + svfloat32_t zero620 = svdup_n_f32(0); + svfloat32_t v620 = svcmla_f32_x(pred_full, zero620, v1172, v473, 90); + svfloat32_t zero704 = svdup_n_f32(0); + svfloat32_t v704 = svcmla_f32_x(pred_full, zero704, v1184, v483, 90); + svfloat32_t zero711 = svdup_n_f32(0); + svfloat32_t v711 = svcmla_f32_x(pred_full, zero711, v1185, v484, 90); + svfloat32_t v437 = svadd_f32_x(svptrue_b32(), v1147, v429); + svfloat32_t v450 = svsub_f32_x(svptrue_b32(), v447, v449); + svfloat32_t v482 = svsub_f32_x(svptrue_b32(), v479, v481); + svfloat32_t v510 = 
svmul_f32_x(svptrue_b32(), v440, v1152); + svfloat32_t v525 = svmul_f32_x(svptrue_b32(), v443, v1155); + svfloat32_t v585 = svmul_f32_x(svptrue_b32(), v453, v1167); + svfloat32_t zero592 = svdup_n_f32(0); + svfloat32_t v592 = svcmla_f32_x(pred_full, zero592, v1168, v469, 90); + svfloat32_t zero718 = svdup_n_f32(0); + svfloat32_t v718 = svcmla_f32_x(pred_full, zero718, v1186, v485, 90); + svfloat32_t v719 = svmla_f32_x(pred_full, v505, v438, v1150); + svfloat32_t v720 = svmla_f32_x(pred_full, v520, v441, v1153); + svfloat32_t v750 = svcmla_f32_x(pred_full, v599, v1170, v471, 90); + svfloat32_t v751 = svcmla_f32_x(pred_full, v620, v1173, v474, 90); + svfloat32_t v570 = svmul_f32_x(svptrue_b32(), v450, v1164); + svfloat32_t zero697 = svdup_n_f32(0); + svfloat32_t v697 = svcmla_f32_x(pred_full, zero697, v1183, v482, 90); + svfloat32_t v722 = svadd_f32_x(svptrue_b32(), v719, v720); + svfloat32_t v723 = svmla_f32_x(pred_full, v510, v438, v1150); + svfloat32_t v724 = svmla_f32_x(pred_full, v525, v441, v1153); + svfloat32_t v741 = svsub_f32_x(svptrue_b32(), v719, v720); + svfloat32_t v743 = svnmls_f32_x(pred_full, v585, v451, v1165); + svfloat32_t v744 = svnmls_f32_x(pred_full, v585, v452, v1166); + svfloat32_t v745 = svmla_f32_x(pred_full, v437, v429, v1149); + svfloat32_t v753 = svadd_f32_x(svptrue_b32(), v750, v751); + svfloat32_t v754 = svcmla_f32_x(pred_full, v599, v1171, v472, 90); + svfloat32_t v755 = svcmla_f32_x(pred_full, v620, v1174, v475, 90); + svfloat32_t v772 = svsub_f32_x(svptrue_b32(), v750, v751); + svfloat32_t v774 = svsub_f32_x(svptrue_b32(), v704, v718); + svfloat32_t v775 = svsub_f32_x(svptrue_b32(), v711, v718); + svint16_t v806 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v437, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v721 = svmla_f32_x(pred_full, v570, v449, v1163); + svfloat32_t v725 = svmla_f32_x(pred_full, v570, v447, v1162); + svfloat32_t v726 = svnmls_f32_x(pred_full, v722, v417, v1156); + svfloat32_t v727 = svadd_f32_x(svptrue_b32(), v723, v724); + svfloat32_t v733 = svsub_f32_x(svptrue_b32(), v723, v724); + svfloat32_t v738 = svmla_f32_x(pred_full, v722, v416, v1161); + svfloat32_t v746 = svadd_f32_x(svptrue_b32(), v745, v743); + svfloat32_t v747 = svsub_f32_x(svptrue_b32(), v745, v743); + svfloat32_t v749 = svadd_f32_x(svptrue_b32(), v745, v744); + svfloat32_t v752 = svcmla_f32_x(pred_full, v697, v1182, v481, 90); + svfloat32_t v756 = svcmla_f32_x(pred_full, v697, v1181, v479, 90); + svfloat32_t v757 = svsub_f32_x(svptrue_b32(), v641, v753); + svfloat32_t v758 = svadd_f32_x(svptrue_b32(), v754, v755); + svfloat32_t v764 = svsub_f32_x(svptrue_b32(), v754, v755); + svfloat32_t v769 = svcmla_f32_x(pred_full, v753, v1180, v456, 90); + svfloat32_t v776 = svadd_f32_x(svptrue_b32(), v592, v774); + svfloat32_t v777 = svsub_f32_x(svptrue_b32(), v592, v774); + svfloat32_t v779 = svadd_f32_x(svptrue_b32(), v592, v775); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1194), v1357, + svreinterpret_u64_s16(v806)); + svfloat32_t v728 = svnmls_f32_x(pred_full, v725, v419, v1159); + svfloat32_t v729 = svmla_f32_x(pred_full, v721, v444, v1157); + svfloat32_t v731 = svmla_f32_x(pred_full, v727, v445, v1160); + svfloat32_t v734 = svadd_f32_x(svptrue_b32(), v733, v721); + svfloat32_t v735 = svadd_f32_x(svptrue_b32(), v726, v727); + svfloat32_t v742 = svadd_f32_x(svptrue_b32(), v741, v725); + svfloat32_t v748 = svsub_f32_x(svptrue_b32(), v747, 
v744); + svfloat32_t v759 = svsub_f32_x(svptrue_b32(), v662, v756); + svfloat32_t v760 = svcmla_f32_x(pred_full, v752, v1176, v476, 90); + svfloat32_t v762 = svcmla_f32_x(pred_full, v758, v1179, v477, 90); + svfloat32_t v765 = svadd_f32_x(svptrue_b32(), v764, v752); + svfloat32_t v766 = svadd_f32_x(svptrue_b32(), v757, v758); + svfloat32_t v773 = svadd_f32_x(svptrue_b32(), v772, v756); + svfloat32_t v778 = svsub_f32_x(svptrue_b32(), v777, v775); + svfloat32_t v730 = svadd_f32_x(svptrue_b32(), v729, v726); + svfloat32_t v732 = svadd_f32_x(svptrue_b32(), v731, v728); + svfloat32_t v736 = svmla_f32_x(pred_full, v735, v414, v1158); + svfloat32_t v739 = svadd_f32_x(svptrue_b32(), v738, v728); + svfloat32_t v761 = svadd_f32_x(svptrue_b32(), v760, v757); + svfloat32_t v763 = svadd_f32_x(svptrue_b32(), v762, v759); + svfloat32_t v767 = svcmla_f32_x(pred_full, v766, v1177, v454, 90); + svfloat32_t v770 = svadd_f32_x(svptrue_b32(), v769, v759); + svfloat32_t v784 = svsub_f32_x(svptrue_b32(), v742, v734); + svfloat32_t v788 = svsub_f32_x(svptrue_b32(), v749, v742); + svfloat32_t v791 = svadd_f32_x(svptrue_b32(), v734, v749); + svfloat32_t v796 = svsub_f32_x(svptrue_b32(), v773, v765); + svfloat32_t v800 = svsub_f32_x(svptrue_b32(), v773, v779); + svfloat32_t v803 = svadd_f32_x(svptrue_b32(), v765, v779); + svfloat32_t v737 = svadd_f32_x(svptrue_b32(), v736, v725); + svfloat32_t v740 = svadd_f32_x(svptrue_b32(), v739, v721); + svfloat32_t v768 = svadd_f32_x(svptrue_b32(), v767, v756); + svfloat32_t v771 = svadd_f32_x(svptrue_b32(), v770, v752); + svfloat32_t v785 = svadd_f32_x(svptrue_b32(), v784, v749); + svfloat32_t v789 = svadd_f32_x(svptrue_b32(), v730, v746); + svfloat32_t v790 = svadd_f32_x(svptrue_b32(), v732, v748); + svfloat32_t v797 = svadd_f32_x(svptrue_b32(), v796, v779); + svfloat32_t v801 = svadd_f32_x(svptrue_b32(), v761, v776); + svfloat32_t v802 = svadd_f32_x(svptrue_b32(), v763, v778); + svfloat32_t v830 = svsub_f32_x(svptrue_b32(), v791, v803); + svfloat32_t v839 = svadd_f32_x(svptrue_b32(), v791, v803); + svfloat32_t v848 = svadd_f32_x(svptrue_b32(), v788, v800); + svfloat32_t v857 = svsub_f32_x(svptrue_b32(), v788, v800); + svfloat32_t v780 = svsub_f32_x(svptrue_b32(), v737, v730); + svfloat32_t v782 = svsub_f32_x(svptrue_b32(), v740, v732); + svfloat32_t v786 = svsub_f32_x(svptrue_b32(), v746, v737); + svfloat32_t v787 = svsub_f32_x(svptrue_b32(), v748, v740); + svfloat32_t v792 = svsub_f32_x(svptrue_b32(), v768, v761); + svfloat32_t v794 = svsub_f32_x(svptrue_b32(), v771, v763); + svfloat32_t v798 = svsub_f32_x(svptrue_b32(), v776, v768); + svfloat32_t v799 = svsub_f32_x(svptrue_b32(), v778, v771); + svint16_t v833 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v830, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v842 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v839, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v851 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v848, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v860 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v857, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 
0x0000000000040004ULL))); + svfloat32_t v866 = svadd_f32_x(svptrue_b32(), v790, v802); + svfloat32_t v875 = svsub_f32_x(svptrue_b32(), v790, v802); + svfloat32_t v884 = svadd_f32_x(svptrue_b32(), v785, v797); + svfloat32_t v893 = svsub_f32_x(svptrue_b32(), v785, v797); + svfloat32_t v938 = svsub_f32_x(svptrue_b32(), v789, v801); + svfloat32_t v947 = svadd_f32_x(svptrue_b32(), v789, v801); + svfloat32_t v781 = svadd_f32_x(svptrue_b32(), v780, v746); + svfloat32_t v783 = svadd_f32_x(svptrue_b32(), v782, v748); + svfloat32_t v793 = svadd_f32_x(svptrue_b32(), v792, v776); + svfloat32_t v795 = svadd_f32_x(svptrue_b32(), v794, v778); + svint16_t v869 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v866, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v878 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v875, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v887 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v884, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v896 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v893, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v902 = svadd_f32_x(svptrue_b32(), v787, v799); + svfloat32_t v911 = svsub_f32_x(svptrue_b32(), v787, v799); + svfloat32_t v920 = svadd_f32_x(svptrue_b32(), v786, v798); + svfloat32_t v929 = svsub_f32_x(svptrue_b32(), v786, v798); + svint16_t v941 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v938, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v950 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v947, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1221), v1357, + svreinterpret_u64_s16(v833)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1230), v1357, + svreinterpret_u64_s16(v842)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1239), v1357, + svreinterpret_u64_s16(v851)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1248), v1357, + svreinterpret_u64_s16(v860)); + svfloat32_t v812 = svadd_f32_x(svptrue_b32(), v781, v793); + svfloat32_t v821 = svsub_f32_x(svptrue_b32(), v781, v793); + svint16_t v905 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v902, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v914 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v911, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v923 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v920, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v932 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + 
pred_full, svmul_n_f32_x(pred_full, v929, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v956 = svadd_f32_x(svptrue_b32(), v783, v795); + svfloat32_t v965 = svsub_f32_x(svptrue_b32(), v783, v795); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1257), v1357, + svreinterpret_u64_s16(v869)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1266), v1357, + svreinterpret_u64_s16(v878)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1275), v1357, + svreinterpret_u64_s16(v887)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1284), v1357, + svreinterpret_u64_s16(v896)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1329), v1357, + svreinterpret_u64_s16(v941)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1338), v1357, + svreinterpret_u64_s16(v950)); + svint16_t v815 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v812, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v824 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v821, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v959 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v956, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v968 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v965, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1293), v1357, + svreinterpret_u64_s16(v905)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1302), v1357, + svreinterpret_u64_s16(v914)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1311), v1357, + svreinterpret_u64_s16(v923)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1320), v1357, + svreinterpret_u64_s16(v932)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1203), v1357, + svreinterpret_u64_s16(v815)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1212), v1357, + svreinterpret_u64_s16(v824)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1347), v1357, + svreinterpret_u64_s16(v959)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1356), v1357, + svreinterpret_u64_s16(v968)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cs16_ab_t_gs20(const armral_cmplx_f32_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, + const armral_cmplx_f32_t *restrict w, + int howmany, int idist, int odist, + float dir) { + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + int32_t *v6 = (int32_t *)y; + const float32x2_t *v7 = (const float32x2_t *)w; + for (int j = 0; j < howmany; j += 1) { + float32x2_t v547 = v5[istride]; + float v760 = 1.5388417685876268e+00F; + float v767 = 5.8778525229247325e-01F; + float v774 = 3.6327126400268028e-01F; + float v798 = 1.0000000000000000e+00F; + float v799 = -1.0000000000000000e+00F; + float v805 = -1.2500000000000000e+00F; + float v806 = 1.2500000000000000e+00F; + float v812 = 5.5901699437494745e-01F; + float v813 = -5.5901699437494745e-01F; + float32x2_t v815 = (float32x2_t){v4, v4}; + float 
v820 = -1.5388417685876268e+00F; + float v824 = -5.8778525229247325e-01F; + float v828 = -3.6327126400268028e-01F; + float32x2_t v584 = vtrn1_f32(v547, v547); + float32x2_t v585 = vtrn2_f32(v547, v547); + float32x2_t v609 = v5[0]; + float32x2_t v754 = (float32x2_t){v805, v805}; + float32x2_t v758 = (float32x2_t){v812, v812}; + float32x2_t v762 = (float32x2_t){v760, v820}; + float32x2_t v769 = (float32x2_t){v767, v824}; + float32x2_t v776 = (float32x2_t){v774, v828}; + float32x2_t v800 = (float32x2_t){v798, v799}; + float32x2_t v807 = (float32x2_t){v805, v806}; + float32x2_t v814 = (float32x2_t){v812, v813}; + float32x2_t v821 = (float32x2_t){v820, v820}; + float32x2_t v825 = (float32x2_t){v824, v824}; + float32x2_t v829 = (float32x2_t){v828, v828}; + float32x2_t v20 = v5[istride * 10]; + int64_t v37 = 18 + j * 38; + float32x2_t v51 = v5[istride * 5]; + float32x2_t v69 = v5[istride * 15]; + int64_t v86 = 8 + j * 38; + int64_t v99 = 28 + j * 38; + float32x2_t v113 = v5[istride * 4]; + float32x2_t v131 = v5[istride * 14]; + int64_t v148 = 6 + j * 38; + int64_t v161 = 26 + j * 38; + float32x2_t v175 = v5[istride * 9]; + float32x2_t v193 = v5[istride * 19]; + int64_t v210 = 16 + j * 38; + int64_t v223 = 36 + j * 38; + float32x2_t v237 = v5[istride * 8]; + float32x2_t v255 = v5[istride * 18]; + int64_t v272 = 14 + j * 38; + int64_t v285 = 34 + j * 38; + float32x2_t v299 = v5[istride * 13]; + float32x2_t v317 = v5[istride * 3]; + int64_t v334 = 24 + j * 38; + int64_t v347 = 4 + j * 38; + float32x2_t v361 = v5[istride * 12]; + float32x2_t v379 = v5[istride * 2]; + int64_t v396 = 22 + j * 38; + int64_t v409 = 2 + j * 38; + float32x2_t v423 = v5[istride * 17]; + float32x2_t v441 = v5[istride * 7]; + int64_t v458 = 32 + j * 38; + int64_t v471 = 12 + j * 38; + float32x2_t v485 = v5[istride * 16]; + float32x2_t v503 = v5[istride * 6]; + int64_t v520 = 30 + j * 38; + int64_t v533 = 10 + j * 38; + float32x2_t v565 = v5[istride * 11]; + float32x2_t v583 = v7[j * 38]; + int64_t v587 = j * 38 + 1; + int64_t v595 = 20 + j * 38; + float32x2_t v764 = vmul_f32(v815, v762); + float32x2_t v771 = vmul_f32(v815, v769); + float32x2_t v778 = vmul_f32(v815, v776); + float32x2_t v802 = vmul_f32(v815, v800); + float32x2_t v809 = vmul_f32(v815, v807); + float32x2_t v816 = vmul_f32(v815, v814); + float32x2_t v38 = v7[v37]; + float32x2_t v39 = vtrn1_f32(v20, v20); + float32x2_t v40 = vtrn2_f32(v20, v20); + int64_t v42 = v37 + 1; + float32x2_t v87 = v7[v86]; + float32x2_t v88 = vtrn1_f32(v51, v51); + float32x2_t v89 = vtrn2_f32(v51, v51); + int64_t v91 = v86 + 1; + float32x2_t v100 = v7[v99]; + float32x2_t v101 = vtrn1_f32(v69, v69); + float32x2_t v102 = vtrn2_f32(v69, v69); + int64_t v104 = v99 + 1; + float32x2_t v149 = v7[v148]; + float32x2_t v150 = vtrn1_f32(v113, v113); + float32x2_t v151 = vtrn2_f32(v113, v113); + int64_t v153 = v148 + 1; + float32x2_t v162 = v7[v161]; + float32x2_t v163 = vtrn1_f32(v131, v131); + float32x2_t v164 = vtrn2_f32(v131, v131); + int64_t v166 = v161 + 1; + float32x2_t v211 = v7[v210]; + float32x2_t v212 = vtrn1_f32(v175, v175); + float32x2_t v213 = vtrn2_f32(v175, v175); + int64_t v215 = v210 + 1; + float32x2_t v224 = v7[v223]; + float32x2_t v225 = vtrn1_f32(v193, v193); + float32x2_t v226 = vtrn2_f32(v193, v193); + int64_t v228 = v223 + 1; + float32x2_t v273 = v7[v272]; + float32x2_t v274 = vtrn1_f32(v237, v237); + float32x2_t v275 = vtrn2_f32(v237, v237); + int64_t v277 = v272 + 1; + float32x2_t v286 = v7[v285]; + float32x2_t v287 = vtrn1_f32(v255, v255); + float32x2_t v288 = 
vtrn2_f32(v255, v255); + int64_t v290 = v285 + 1; + float32x2_t v335 = v7[v334]; + float32x2_t v336 = vtrn1_f32(v299, v299); + float32x2_t v337 = vtrn2_f32(v299, v299); + int64_t v339 = v334 + 1; + float32x2_t v348 = v7[v347]; + float32x2_t v349 = vtrn1_f32(v317, v317); + float32x2_t v350 = vtrn2_f32(v317, v317); + int64_t v352 = v347 + 1; + float32x2_t v397 = v7[v396]; + float32x2_t v398 = vtrn1_f32(v361, v361); + float32x2_t v399 = vtrn2_f32(v361, v361); + int64_t v401 = v396 + 1; + float32x2_t v410 = v7[v409]; + float32x2_t v411 = vtrn1_f32(v379, v379); + float32x2_t v412 = vtrn2_f32(v379, v379); + int64_t v414 = v409 + 1; + float32x2_t v459 = v7[v458]; + float32x2_t v460 = vtrn1_f32(v423, v423); + float32x2_t v461 = vtrn2_f32(v423, v423); + int64_t v463 = v458 + 1; + float32x2_t v472 = v7[v471]; + float32x2_t v473 = vtrn1_f32(v441, v441); + float32x2_t v474 = vtrn2_f32(v441, v441); + int64_t v476 = v471 + 1; + float32x2_t v521 = v7[v520]; + float32x2_t v522 = vtrn1_f32(v485, v485); + float32x2_t v523 = vtrn2_f32(v485, v485); + int64_t v525 = v520 + 1; + float32x2_t v534 = v7[v533]; + float32x2_t v535 = vtrn1_f32(v503, v503); + float32x2_t v536 = vtrn2_f32(v503, v503); + int64_t v538 = v533 + 1; + float32x2_t v588 = v7[v587]; + float32x2_t v589 = vmul_f32(v584, v583); + float32x2_t v596 = v7[v595]; + float32x2_t v597 = vtrn1_f32(v565, v565); + float32x2_t v598 = vtrn2_f32(v565, v565); + int64_t v600 = v595 + 1; + float32x2_t v43 = v7[v42]; + float32x2_t v44 = vmul_f32(v39, v38); + float32x2_t v92 = v7[v91]; + float32x2_t v93 = vmul_f32(v88, v87); + float32x2_t v105 = v7[v104]; + float32x2_t v106 = vmul_f32(v101, v100); + float32x2_t v154 = v7[v153]; + float32x2_t v155 = vmul_f32(v150, v149); + float32x2_t v167 = v7[v166]; + float32x2_t v168 = vmul_f32(v163, v162); + float32x2_t v216 = v7[v215]; + float32x2_t v217 = vmul_f32(v212, v211); + float32x2_t v229 = v7[v228]; + float32x2_t v230 = vmul_f32(v225, v224); + float32x2_t v278 = v7[v277]; + float32x2_t v279 = vmul_f32(v274, v273); + float32x2_t v291 = v7[v290]; + float32x2_t v292 = vmul_f32(v287, v286); + float32x2_t v340 = v7[v339]; + float32x2_t v341 = vmul_f32(v336, v335); + float32x2_t v353 = v7[v352]; + float32x2_t v354 = vmul_f32(v349, v348); + float32x2_t v402 = v7[v401]; + float32x2_t v403 = vmul_f32(v398, v397); + float32x2_t v415 = v7[v414]; + float32x2_t v416 = vmul_f32(v411, v410); + float32x2_t v464 = v7[v463]; + float32x2_t v465 = vmul_f32(v460, v459); + float32x2_t v477 = v7[v476]; + float32x2_t v478 = vmul_f32(v473, v472); + float32x2_t v526 = v7[v525]; + float32x2_t v527 = vmul_f32(v522, v521); + float32x2_t v539 = v7[v538]; + float32x2_t v540 = vmul_f32(v535, v534); + float32x2_t v601 = v7[v600]; + float32x2_t v602 = vmul_f32(v597, v596); + float32x2_t v591 = vfma_f32(v589, v585, v588); + float32x2_t v46 = vfma_f32(v44, v40, v43); + float32x2_t v95 = vfma_f32(v93, v89, v92); + float32x2_t v108 = vfma_f32(v106, v102, v105); + float32x2_t v157 = vfma_f32(v155, v151, v154); + float32x2_t v170 = vfma_f32(v168, v164, v167); + float32x2_t v219 = vfma_f32(v217, v213, v216); + float32x2_t v232 = vfma_f32(v230, v226, v229); + float32x2_t v281 = vfma_f32(v279, v275, v278); + float32x2_t v294 = vfma_f32(v292, v288, v291); + float32x2_t v343 = vfma_f32(v341, v337, v340); + float32x2_t v356 = vfma_f32(v354, v350, v353); + float32x2_t v405 = vfma_f32(v403, v399, v402); + float32x2_t v418 = vfma_f32(v416, v412, v415); + float32x2_t v467 = vfma_f32(v465, v461, v464); + float32x2_t v480 = vfma_f32(v478, v474, v477); + float32x2_t v529 
= vfma_f32(v527, v523, v526); + float32x2_t v542 = vfma_f32(v540, v536, v539); + float32x2_t v604 = vfma_f32(v602, v598, v601); + float32x2_t v610 = vadd_f32(v609, v46); + float32x2_t v611 = vsub_f32(v609, v46); + float32x2_t v612 = vadd_f32(v95, v108); + float32x2_t v613 = vsub_f32(v95, v108); + float32x2_t v616 = vadd_f32(v157, v170); + float32x2_t v617 = vsub_f32(v157, v170); + float32x2_t v618 = vadd_f32(v219, v232); + float32x2_t v619 = vsub_f32(v219, v232); + float32x2_t v622 = vadd_f32(v281, v294); + float32x2_t v623 = vsub_f32(v281, v294); + float32x2_t v624 = vadd_f32(v343, v356); + float32x2_t v625 = vsub_f32(v343, v356); + float32x2_t v628 = vadd_f32(v405, v418); + float32x2_t v629 = vsub_f32(v405, v418); + float32x2_t v630 = vadd_f32(v467, v480); + float32x2_t v631 = vsub_f32(v467, v480); + float32x2_t v634 = vadd_f32(v529, v542); + float32x2_t v635 = vsub_f32(v529, v542); + float32x2_t v636 = vadd_f32(v591, v604); + float32x2_t v637 = vsub_f32(v591, v604); + float32x2_t v614 = vadd_f32(v610, v612); + float32x2_t v615 = vsub_f32(v610, v612); + float32x2_t v620 = vadd_f32(v616, v618); + float32x2_t v621 = vsub_f32(v616, v618); + float32x2_t v626 = vadd_f32(v622, v624); + float32x2_t v627 = vsub_f32(v622, v624); + float32x2_t v632 = vadd_f32(v628, v630); + float32x2_t v633 = vsub_f32(v628, v630); + float32x2_t v638 = vadd_f32(v634, v636); + float32x2_t v639 = vsub_f32(v634, v636); + float32x2_t v740 = vadd_f32(v617, v635); + float32x2_t v741 = vsub_f32(v617, v635); + float32x2_t v742 = vadd_f32(v629, v623); + float32x2_t v743 = vsub_f32(v629, v623); + float32x2_t v790 = vadd_f32(v619, v637); + float32x2_t v791 = vsub_f32(v619, v637); + float32x2_t v792 = vadd_f32(v631, v625); + float32x2_t v793 = vsub_f32(v631, v625); + float32x2_t v640 = vadd_f32(v620, v638); + float32x2_t v641 = vsub_f32(v620, v638); + float32x2_t v642 = vadd_f32(v632, v626); + float32x2_t v643 = vsub_f32(v632, v626); + float32x2_t v690 = vadd_f32(v621, v639); + float32x2_t v691 = vsub_f32(v621, v639); + float32x2_t v692 = vadd_f32(v633, v627); + float32x2_t v693 = vsub_f32(v633, v627); + float32x2_t v744 = vadd_f32(v740, v742); + float32x2_t v745 = vsub_f32(v740, v742); + float32x2_t v746 = vadd_f32(v741, v743); + float32x2_t v765 = vrev64_f32(v741); + float32x2_t v779 = vrev64_f32(v743); + float32x2_t v794 = vadd_f32(v790, v792); + float32x2_t v795 = vsub_f32(v790, v792); + float32x2_t v796 = vadd_f32(v791, v793); + float32x2_t v822 = vmul_f32(v791, v821); + float32x2_t v830 = vmul_f32(v793, v829); + float32x2_t v644 = vadd_f32(v640, v642); + float32x2_t v645 = vsub_f32(v640, v642); + float32x2_t v646 = vadd_f32(v641, v643); + float32x2_t v665 = vrev64_f32(v641); + float32x2_t v679 = vrev64_f32(v643); + float32x2_t v694 = vadd_f32(v690, v692); + float32x2_t v695 = vsub_f32(v690, v692); + float32x2_t v696 = vadd_f32(v691, v693); + float32x2_t v715 = vrev64_f32(v691); + float32x2_t v729 = vrev64_f32(v693); + float32x2_t v747 = vadd_f32(v744, v611); + float32x2_t v755 = vmul_f32(v744, v754); + float32x2_t v759 = vmul_f32(v745, v758); + float32x2_t v766 = vmul_f32(v765, v764); + float32x2_t v772 = vrev64_f32(v746); + float32x2_t v780 = vmul_f32(v779, v778); + float32x2_t v797 = vadd_f32(v794, v613); + float32x2_t v810 = vrev64_f32(v794); + float32x2_t v817 = vrev64_f32(v795); + float32x2_t v826 = vmul_f32(v796, v825); + float32x2_t v647 = vadd_f32(v644, v614); + float32x2_t v655 = vmul_f32(v644, v754); + float32x2_t v659 = vmul_f32(v645, v758); + float32x2_t v666 = vmul_f32(v665, v764); + float32x2_t v672 = 
vrev64_f32(v646); + float32x2_t v680 = vmul_f32(v679, v778); + float32x2_t v697 = vadd_f32(v694, v615); + float32x2_t v705 = vmul_f32(v694, v754); + float32x2_t v709 = vmul_f32(v695, v758); + float32x2_t v716 = vmul_f32(v715, v764); + float32x2_t v722 = vrev64_f32(v696); + float32x2_t v730 = vmul_f32(v729, v778); + float32x2_t v773 = vmul_f32(v772, v771); + float32x2_t v781 = vadd_f32(v747, v755); + float32x2_t v803 = vrev64_f32(v797); + float32x2_t v811 = vmul_f32(v810, v809); + float32x2_t v818 = vmul_f32(v817, v816); + float32x2_t v834 = vsub_f32(v822, v826); + float32x2_t v835 = vadd_f32(v826, v830); + float32x2_t v673 = vmul_f32(v672, v771); + float32x2_t v681 = vadd_f32(v647, v655); + float32x2_t v723 = vmul_f32(v722, v771); + float32x2_t v731 = vadd_f32(v697, v705); + float32x2_t v782 = vadd_f32(v781, v759); + float32x2_t v783 = vsub_f32(v781, v759); + float32x2_t v784 = vsub_f32(v766, v773); + float32x2_t v785 = vadd_f32(v773, v780); + float32x2_t v804 = vmul_f32(v803, v802); + int16x4_t v844 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v647, 15), (int32x2_t){0, 0})); + int16x4_t v856 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v697, 15), (int32x2_t){0, 0})); + float32x2_t v682 = vadd_f32(v681, v659); + float32x2_t v683 = vsub_f32(v681, v659); + float32x2_t v684 = vsub_f32(v666, v673); + float32x2_t v685 = vadd_f32(v673, v680); + float32x2_t v732 = vadd_f32(v731, v709); + float32x2_t v733 = vsub_f32(v731, v709); + float32x2_t v734 = vsub_f32(v716, v723); + float32x2_t v735 = vadd_f32(v723, v730); + float32x2_t v786 = vadd_f32(v782, v784); + float32x2_t v787 = vsub_f32(v782, v784); + float32x2_t v788 = vadd_f32(v783, v785); + float32x2_t v789 = vsub_f32(v783, v785); + float32x2_t v831 = vadd_f32(v804, v811); + float32x2_t v840 = vadd_f32(v747, v804); + float32x2_t v841 = vsub_f32(v747, v804); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v844), 0); + v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v856), 0); + float32x2_t v686 = vadd_f32(v682, v684); + float32x2_t v687 = vsub_f32(v682, v684); + float32x2_t v688 = vadd_f32(v683, v685); + float32x2_t v689 = vsub_f32(v683, v685); + float32x2_t v736 = vadd_f32(v732, v734); + float32x2_t v737 = vsub_f32(v732, v734); + float32x2_t v738 = vadd_f32(v733, v735); + float32x2_t v739 = vsub_f32(v733, v735); + float32x2_t v832 = vadd_f32(v831, v818); + float32x2_t v833 = vsub_f32(v831, v818); + int16x4_t v850 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v841, 15), (int32x2_t){0, 0})); + int16x4_t v862 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v840, 15), (int32x2_t){0, 0})); + float32x2_t v836 = vadd_f32(v832, v834); + float32x2_t v837 = vsub_f32(v832, v834); + float32x2_t v838 = vadd_f32(v833, v835); + float32x2_t v839 = vsub_f32(v833, v835); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v850), 0); + v6[ostride * 15] = vget_lane_s32(vreinterpret_s32_s16(v862), 0); + int16x4_t v870 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v687, 15), (int32x2_t){0, 0})); + int16x4_t v882 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v737, 15), (int32x2_t){0, 0})); + int16x4_t v896 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v689, 15), (int32x2_t){0, 0})); + int16x4_t v908 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v739, 15), (int32x2_t){0, 0})); + int16x4_t v922 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v688, 15), (int32x2_t){0, 0})); + int16x4_t v934 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v738, 15), (int32x2_t){0, 0})); + int16x4_t v948 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v686, 15), (int32x2_t){0, 0})); + int16x4_t v960 = + 
vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v736, 15), (int32x2_t){0, 0})); + float32x2_t v866 = vadd_f32(v787, v837); + float32x2_t v867 = vsub_f32(v787, v837); + v6[ostride * 16] = vget_lane_s32(vreinterpret_s32_s16(v870), 0); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v882), 0); + float32x2_t v892 = vadd_f32(v789, v839); + float32x2_t v893 = vsub_f32(v789, v839); + v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v896), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v908), 0); + float32x2_t v918 = vadd_f32(v788, v838); + float32x2_t v919 = vsub_f32(v788, v838); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v922), 0); + v6[ostride * 18] = vget_lane_s32(vreinterpret_s32_s16(v934), 0); + float32x2_t v944 = vadd_f32(v786, v836); + float32x2_t v945 = vsub_f32(v786, v836); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v948), 0); + v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v960), 0); + int16x4_t v876 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v867, 15), (int32x2_t){0, 0})); + int16x4_t v888 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v866, 15), (int32x2_t){0, 0})); + int16x4_t v902 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v893, 15), (int32x2_t){0, 0})); + int16x4_t v914 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v892, 15), (int32x2_t){0, 0})); + int16x4_t v928 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v919, 15), (int32x2_t){0, 0})); + int16x4_t v940 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v918, 15), (int32x2_t){0, 0})); + int16x4_t v954 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v945, 15), (int32x2_t){0, 0})); + int16x4_t v966 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v944, 15), (int32x2_t){0, 0})); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v876), 0); + v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v888), 0); + v6[ostride * 17] = vget_lane_s32(vreinterpret_s32_s16(v902), 0); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v914), 0); + v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v928), 0); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v940), 0); + v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v954), 0); + v6[ostride * 19] = vget_lane_s32(vreinterpret_s32_s16(v966), 0); + v5 += 1 * idist; + v6 += 1 * odist; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cs16_ab_t_gs20(const armral_cmplx_f32_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, + const armral_cmplx_f32_t *restrict w, + int howmany, int idist, int odist, + float dir) { + int64_t v0 = istride; + int64_t v1 = idist; + int64_t v2 = ostride; + int64_t v3 = odist; + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + int32_t *v6 = (int32_t *)y; + const float32x2_t *v7 = (const float32x2_t *)w; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * v1; + int64_t v12 = v10 * v3; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + int64_t v13 = j; + float v574 = -1.2500000000000000e+00F; + float v579 = 5.5901699437494745e-01F; + float v622 = -1.0000000000000000e+00F; + float v629 = 1.2500000000000000e+00F; + float v636 = -5.5901699437494745e-01F; + float v643 = -1.5388417685876268e+00F; + float v648 = -5.8778525229247325e-01F; + float v653 = -3.6327126400268028e-01F; + const float32x2_t *v995 = &v5[v0]; + int32_t *v1095 = &v6[v2]; + int64_t v19 = v0 * 10; + int64_t v34 = v10 * 9; + int64_t v40 = v0 * 5; + int64_t v54 = v0 * 15; + int64_t v69 = v10 * 4; + int64_t v76 = v10 * 14; + int64_t v82 = v0 * 4; + 
int64_t v96 = v0 * 14; + int64_t v111 = v10 * 3; + int64_t v118 = v10 * 13; + int64_t v124 = v0 * 9; + int64_t v138 = v0 * 19; + int64_t v153 = v10 * 8; + int64_t v160 = v10 * 18; + int64_t v166 = v0 * 8; + int64_t v180 = v0 * 18; + int64_t v195 = v10 * 7; + int64_t v202 = v10 * 17; + int64_t v208 = v0 * 13; + int64_t v222 = v0 * 3; + int64_t v237 = v10 * 12; + int64_t v244 = v10 * 2; + int64_t v250 = v0 * 12; + int64_t v264 = v0 * 2; + int64_t v279 = v10 * 11; + int64_t v292 = v0 * 17; + int64_t v306 = v0 * 7; + int64_t v321 = v10 * 16; + int64_t v328 = v10 * 6; + int64_t v334 = v0 * 16; + int64_t v348 = v0 * 6; + int64_t v363 = v10 * 15; + int64_t v370 = v10 * 5; + int64_t v390 = v0 * 11; + int64_t v412 = v10 * 10; + int64_t v413 = v13 * 19; + float v587 = v4 * v643; + float v594 = v4 * v648; + float v601 = v4 * v653; + float v625 = v4 * v622; + float v632 = v4 * v629; + float v639 = v4 * v636; + int64_t v677 = v2 * 5; + int64_t v685 = v2 * 10; + int64_t v693 = v2 * 15; + int64_t v703 = v2 * 16; + int64_t v719 = v2 * 6; + int64_t v727 = v2 * 11; + int64_t v737 = v2 * 12; + int64_t v745 = v2 * 17; + int64_t v753 = v2 * 2; + int64_t v761 = v2 * 7; + int64_t v771 = v2 * 8; + int64_t v779 = v2 * 13; + int64_t v787 = v2 * 18; + int64_t v795 = v2 * 3; + int64_t v805 = v2 * 4; + int64_t v813 = v2 * 9; + int64_t v821 = v2 * 14; + int64_t v829 = v2 * 19; + const float32x2_t *v1016 = &v5[0]; + svint64_t v1017 = svindex_s64(0, v1); + svfloat32_t v1032 = svdup_n_f32(v574); + svfloat32_t v1033 = svdup_n_f32(v579); + svfloat32_t v1040 = svdup_n_f32(v643); + svfloat32_t v1041 = svdup_n_f32(v648); + svfloat32_t v1042 = svdup_n_f32(v653); + int32_t *v1050 = &v6[0]; + svint64_t v1222 = svindex_s64(0, v3); + int64_t v36 = v34 + v413; + int64_t v71 = v69 + v413; + int64_t v78 = v76 + v413; + int64_t v113 = v111 + v413; + int64_t v120 = v118 + v413; + int64_t v155 = v153 + v413; + int64_t v162 = v160 + v413; + int64_t v197 = v195 + v413; + int64_t v204 = v202 + v413; + int64_t v239 = v237 + v413; + int64_t v246 = v244 + v413; + int64_t v281 = v279 + v413; + int64_t v288 = v10 + v413; + int64_t v323 = v321 + v413; + int64_t v330 = v328 + v413; + int64_t v365 = v363 + v413; + int64_t v372 = v370 + v413; + svfloat32_t v408 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v413])); + int64_t v414 = v412 + v413; + const float32x2_t *v842 = &v5[v19]; + const float32x2_t *v851 = &v5[v40]; + const float32x2_t *v860 = &v5[v54]; + const float32x2_t *v869 = &v5[v82]; + const float32x2_t *v878 = &v5[v96]; + const float32x2_t *v887 = &v5[v124]; + const float32x2_t *v896 = &v5[v138]; + const float32x2_t *v905 = &v5[v166]; + const float32x2_t *v914 = &v5[v180]; + const float32x2_t *v923 = &v5[v208]; + const float32x2_t *v932 = &v5[v222]; + const float32x2_t *v941 = &v5[v250]; + const float32x2_t *v950 = &v5[v264]; + const float32x2_t *v959 = &v5[v292]; + const float32x2_t *v968 = &v5[v306]; + const float32x2_t *v977 = &v5[v334]; + const float32x2_t *v986 = &v5[v348]; + svfloat32_t v997 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v995), v1017)); + const float32x2_t *v1005 = &v5[v390]; + svfloat32_t v1018 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1016), v1017)); + svfloat32_t v1034 = svdup_n_f32(v587); + svfloat32_t v1035 = svdup_n_f32(v594); + svfloat32_t v1036 = svdup_n_f32(v601); + svfloat32_t v1037 = svdup_n_f32(v625); + svfloat32_t v1038 = svdup_n_f32(v632); + svfloat32_t v1039 = svdup_n_f32(v639); + int32_t *v1059 = 
&v6[v677]; + int32_t *v1068 = &v6[v685]; + int32_t *v1077 = &v6[v693]; + int32_t *v1086 = &v6[v703]; + int32_t *v1104 = &v6[v719]; + int32_t *v1113 = &v6[v727]; + int32_t *v1122 = &v6[v737]; + int32_t *v1131 = &v6[v745]; + int32_t *v1140 = &v6[v753]; + int32_t *v1149 = &v6[v761]; + int32_t *v1158 = &v6[v771]; + int32_t *v1167 = &v6[v779]; + int32_t *v1176 = &v6[v787]; + int32_t *v1185 = &v6[v795]; + int32_t *v1194 = &v6[v805]; + int32_t *v1203 = &v6[v813]; + int32_t *v1212 = &v6[v821]; + int32_t *v1221 = &v6[v829]; + svfloat32_t v37 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v36])); + svfloat32_t v72 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v71])); + svfloat32_t v79 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v78])); + svfloat32_t v114 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v113])); + svfloat32_t v121 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v120])); + svfloat32_t v156 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v155])); + svfloat32_t v163 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v162])); + svfloat32_t v198 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v197])); + svfloat32_t v205 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v204])); + svfloat32_t v240 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v239])); + svfloat32_t v247 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v246])); + svfloat32_t v282 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v281])); + svfloat32_t v289 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v288])); + svfloat32_t v324 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v323])); + svfloat32_t v331 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v330])); + svfloat32_t v366 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v365])); + svfloat32_t v373 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v372])); + svfloat32_t zero409 = svdup_n_f32(0); + svfloat32_t v409 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero409, v997, v408, 0), + v997, v408, 90); + svfloat32_t v415 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v414])); + svfloat32_t v844 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v842), v1017)); + svfloat32_t v853 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v851), v1017)); + svfloat32_t v862 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v860), v1017)); + svfloat32_t v871 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v869), v1017)); + svfloat32_t v880 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v878), v1017)); + svfloat32_t v889 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v887), v1017)); + svfloat32_t v898 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v896), v1017)); + svfloat32_t v907 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v905), v1017)); + svfloat32_t v916 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v914), v1017)); + svfloat32_t v925 = svreinterpret_f32_f64( + 
svld1_gather_s64index_f64(pred_full, (const double *)(v923), v1017)); + svfloat32_t v934 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v932), v1017)); + svfloat32_t v943 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v941), v1017)); + svfloat32_t v952 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v950), v1017)); + svfloat32_t v961 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v959), v1017)); + svfloat32_t v970 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v968), v1017)); + svfloat32_t v979 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v977), v1017)); + svfloat32_t v988 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v986), v1017)); + svfloat32_t v1007 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1005), v1017)); + svfloat32_t zero38 = svdup_n_f32(0); + svfloat32_t v38 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero38, v844, v37, 0), + v844, v37, 90); + svfloat32_t zero73 = svdup_n_f32(0); + svfloat32_t v73 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero73, v853, v72, 0), + v853, v72, 90); + svfloat32_t zero80 = svdup_n_f32(0); + svfloat32_t v80 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero80, v862, v79, 0), + v862, v79, 90); + svfloat32_t zero115 = svdup_n_f32(0); + svfloat32_t v115 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero115, v871, v114, 0), + v871, v114, 90); + svfloat32_t zero122 = svdup_n_f32(0); + svfloat32_t v122 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero122, v880, v121, 0), + v880, v121, 90); + svfloat32_t zero157 = svdup_n_f32(0); + svfloat32_t v157 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero157, v889, v156, 0), + v889, v156, 90); + svfloat32_t zero164 = svdup_n_f32(0); + svfloat32_t v164 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero164, v898, v163, 0), + v898, v163, 90); + svfloat32_t zero199 = svdup_n_f32(0); + svfloat32_t v199 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero199, v907, v198, 0), + v907, v198, 90); + svfloat32_t zero206 = svdup_n_f32(0); + svfloat32_t v206 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero206, v916, v205, 0), + v916, v205, 90); + svfloat32_t zero241 = svdup_n_f32(0); + svfloat32_t v241 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero241, v925, v240, 0), + v925, v240, 90); + svfloat32_t zero248 = svdup_n_f32(0); + svfloat32_t v248 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero248, v934, v247, 0), + v934, v247, 90); + svfloat32_t zero283 = svdup_n_f32(0); + svfloat32_t v283 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero283, v943, v282, 0), + v943, v282, 90); + svfloat32_t zero290 = svdup_n_f32(0); + svfloat32_t v290 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero290, v952, v289, 0), + v952, v289, 90); + svfloat32_t zero325 = svdup_n_f32(0); + svfloat32_t v325 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero325, v961, v324, 0), + v961, v324, 90); + svfloat32_t zero332 = svdup_n_f32(0); + svfloat32_t v332 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero332, v970, v331, 0), + v970, v331, 90); + svfloat32_t zero367 = svdup_n_f32(0); + svfloat32_t v367 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero367, v979, v366, 0), + v979, v366, 90); + svfloat32_t zero374 = svdup_n_f32(0); + svfloat32_t v374 = + svcmla_f32_x(pred_full, 
svcmla_f32_x(pred_full, zero374, v988, v373, 0), + v988, v373, 90); + svfloat32_t zero416 = svdup_n_f32(0); + svfloat32_t v416 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero416, v1007, v415, 0), v1007, + v415, 90); + svfloat32_t v424 = svadd_f32_x(svptrue_b32(), v1018, v38); + svfloat32_t v425 = svsub_f32_x(svptrue_b32(), v1018, v38); + svfloat32_t v426 = svadd_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v427 = svsub_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v430 = svadd_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v431 = svsub_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v432 = svadd_f32_x(svptrue_b32(), v157, v164); + svfloat32_t v433 = svsub_f32_x(svptrue_b32(), v157, v164); + svfloat32_t v436 = svadd_f32_x(svptrue_b32(), v199, v206); + svfloat32_t v437 = svsub_f32_x(svptrue_b32(), v199, v206); + svfloat32_t v438 = svadd_f32_x(svptrue_b32(), v241, v248); + svfloat32_t v439 = svsub_f32_x(svptrue_b32(), v241, v248); + svfloat32_t v442 = svadd_f32_x(svptrue_b32(), v283, v290); + svfloat32_t v443 = svsub_f32_x(svptrue_b32(), v283, v290); + svfloat32_t v444 = svadd_f32_x(svptrue_b32(), v325, v332); + svfloat32_t v445 = svsub_f32_x(svptrue_b32(), v325, v332); + svfloat32_t v448 = svadd_f32_x(svptrue_b32(), v367, v374); + svfloat32_t v449 = svsub_f32_x(svptrue_b32(), v367, v374); + svfloat32_t v450 = svadd_f32_x(svptrue_b32(), v409, v416); + svfloat32_t v451 = svsub_f32_x(svptrue_b32(), v409, v416); + svfloat32_t v428 = svadd_f32_x(svptrue_b32(), v424, v426); + svfloat32_t v429 = svsub_f32_x(svptrue_b32(), v424, v426); + svfloat32_t v434 = svadd_f32_x(svptrue_b32(), v430, v432); + svfloat32_t v435 = svsub_f32_x(svptrue_b32(), v430, v432); + svfloat32_t v440 = svadd_f32_x(svptrue_b32(), v436, v438); + svfloat32_t v441 = svsub_f32_x(svptrue_b32(), v436, v438); + svfloat32_t v446 = svadd_f32_x(svptrue_b32(), v442, v444); + svfloat32_t v447 = svsub_f32_x(svptrue_b32(), v442, v444); + svfloat32_t v452 = svadd_f32_x(svptrue_b32(), v448, v450); + svfloat32_t v453 = svsub_f32_x(svptrue_b32(), v448, v450); + svfloat32_t v560 = svadd_f32_x(svptrue_b32(), v431, v449); + svfloat32_t v561 = svsub_f32_x(svptrue_b32(), v431, v449); + svfloat32_t v562 = svadd_f32_x(svptrue_b32(), v443, v437); + svfloat32_t v563 = svsub_f32_x(svptrue_b32(), v443, v437); + svfloat32_t v613 = svadd_f32_x(svptrue_b32(), v433, v451); + svfloat32_t v614 = svsub_f32_x(svptrue_b32(), v433, v451); + svfloat32_t v615 = svadd_f32_x(svptrue_b32(), v445, v439); + svfloat32_t v616 = svsub_f32_x(svptrue_b32(), v445, v439); + svfloat32_t v454 = svadd_f32_x(svptrue_b32(), v434, v452); + svfloat32_t v455 = svsub_f32_x(svptrue_b32(), v434, v452); + svfloat32_t v456 = svadd_f32_x(svptrue_b32(), v446, v440); + svfloat32_t v457 = svsub_f32_x(svptrue_b32(), v446, v440); + svfloat32_t v507 = svadd_f32_x(svptrue_b32(), v435, v453); + svfloat32_t v508 = svsub_f32_x(svptrue_b32(), v435, v453); + svfloat32_t v509 = svadd_f32_x(svptrue_b32(), v447, v441); + svfloat32_t v510 = svsub_f32_x(svptrue_b32(), v447, v441); + svfloat32_t v564 = svadd_f32_x(svptrue_b32(), v560, v562); + svfloat32_t v565 = svsub_f32_x(svptrue_b32(), v560, v562); + svfloat32_t v566 = svadd_f32_x(svptrue_b32(), v561, v563); + svfloat32_t zero589 = svdup_n_f32(0); + svfloat32_t v589 = svcmla_f32_x(pred_full, zero589, v1034, v561, 90); + svfloat32_t v617 = svadd_f32_x(svptrue_b32(), v613, v615); + svfloat32_t v618 = svsub_f32_x(svptrue_b32(), v613, v615); + svfloat32_t v619 = svadd_f32_x(svptrue_b32(), v614, v616); + svfloat32_t v656 = svmul_f32_x(svptrue_b32(), 
v616, v1042); + svfloat32_t v458 = svadd_f32_x(svptrue_b32(), v454, v456); + svfloat32_t v459 = svsub_f32_x(svptrue_b32(), v454, v456); + svfloat32_t v460 = svadd_f32_x(svptrue_b32(), v455, v457); + svfloat32_t zero483 = svdup_n_f32(0); + svfloat32_t v483 = svcmla_f32_x(pred_full, zero483, v1034, v455, 90); + svfloat32_t v511 = svadd_f32_x(svptrue_b32(), v507, v509); + svfloat32_t v512 = svsub_f32_x(svptrue_b32(), v507, v509); + svfloat32_t v513 = svadd_f32_x(svptrue_b32(), v508, v510); + svfloat32_t zero536 = svdup_n_f32(0); + svfloat32_t v536 = svcmla_f32_x(pred_full, zero536, v1034, v508, 90); + svfloat32_t v567 = svadd_f32_x(svptrue_b32(), v564, v425); + svfloat32_t zero596 = svdup_n_f32(0); + svfloat32_t v596 = svcmla_f32_x(pred_full, zero596, v1035, v566, 90); + svfloat32_t v620 = svadd_f32_x(svptrue_b32(), v617, v427); + svfloat32_t zero641 = svdup_n_f32(0); + svfloat32_t v641 = svcmla_f32_x(pred_full, zero641, v1039, v618, 90); + svfloat32_t v651 = svmul_f32_x(svptrue_b32(), v619, v1041); + svfloat32_t v461 = svadd_f32_x(svptrue_b32(), v458, v428); + svfloat32_t zero490 = svdup_n_f32(0); + svfloat32_t v490 = svcmla_f32_x(pred_full, zero490, v1035, v460, 90); + svfloat32_t v514 = svadd_f32_x(svptrue_b32(), v511, v429); + svfloat32_t zero543 = svdup_n_f32(0); + svfloat32_t v543 = svcmla_f32_x(pred_full, zero543, v1035, v513, 90); + svfloat32_t v604 = svmla_f32_x(pred_full, v567, v564, v1032); + svfloat32_t v607 = svsub_f32_x(svptrue_b32(), v589, v596); + svfloat32_t v608 = svcmla_f32_x(pred_full, v596, v1036, v563, 90); + svfloat32_t zero627 = svdup_n_f32(0); + svfloat32_t v627 = svcmla_f32_x(pred_full, zero627, v1037, v620, 90); + svfloat32_t v660 = svnmls_f32_x(pred_full, v651, v614, v1040); + svfloat32_t v661 = svmla_f32_x(pred_full, v656, v619, v1041); + svfloat32_t v498 = svmla_f32_x(pred_full, v461, v458, v1032); + svfloat32_t v501 = svsub_f32_x(svptrue_b32(), v483, v490); + svfloat32_t v502 = svcmla_f32_x(pred_full, v490, v1036, v457, 90); + svfloat32_t v551 = svmla_f32_x(pred_full, v514, v511, v1032); + svfloat32_t v554 = svsub_f32_x(svptrue_b32(), v536, v543); + svfloat32_t v555 = svcmla_f32_x(pred_full, v543, v1036, v510, 90); + svfloat32_t v605 = svmla_f32_x(pred_full, v604, v565, v1033); + svfloat32_t v606 = svmls_f32_x(pred_full, v604, v565, v1033); + svfloat32_t v657 = svcmla_f32_x(pred_full, v627, v1038, v617, 90); + svfloat32_t v666 = svadd_f32_x(svptrue_b32(), v567, v627); + svfloat32_t v667 = svsub_f32_x(svptrue_b32(), v567, v627); + svint16_t v670 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v461, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v686 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v514, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v499 = svmla_f32_x(pred_full, v498, v459, v1033); + svfloat32_t v500 = svmls_f32_x(pred_full, v498, v459, v1033); + svfloat32_t v552 = svmla_f32_x(pred_full, v551, v512, v1033); + svfloat32_t v553 = svmls_f32_x(pred_full, v551, v512, v1033); + svfloat32_t v609 = svadd_f32_x(svptrue_b32(), v605, v607); + svfloat32_t v610 = svsub_f32_x(svptrue_b32(), v605, v607); + svfloat32_t v611 = svadd_f32_x(svptrue_b32(), v606, v608); + svfloat32_t v612 = svsub_f32_x(svptrue_b32(), v606, v608); + svfloat32_t v658 = svadd_f32_x(svptrue_b32(), v657, v641); + svfloat32_t v659 = 
svsub_f32_x(svptrue_b32(), v657, v641); + svint16_t v678 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v667, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v694 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v666, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1050), v1222, + svreinterpret_u64_s16(v670)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1068), v1222, + svreinterpret_u64_s16(v686)); + svfloat32_t v503 = svadd_f32_x(svptrue_b32(), v499, v501); + svfloat32_t v504 = svsub_f32_x(svptrue_b32(), v499, v501); + svfloat32_t v505 = svadd_f32_x(svptrue_b32(), v500, v502); + svfloat32_t v506 = svsub_f32_x(svptrue_b32(), v500, v502); + svfloat32_t v556 = svadd_f32_x(svptrue_b32(), v552, v554); + svfloat32_t v557 = svsub_f32_x(svptrue_b32(), v552, v554); + svfloat32_t v558 = svadd_f32_x(svptrue_b32(), v553, v555); + svfloat32_t v559 = svsub_f32_x(svptrue_b32(), v553, v555); + svfloat32_t v662 = svadd_f32_x(svptrue_b32(), v658, v660); + svfloat32_t v663 = svsub_f32_x(svptrue_b32(), v658, v660); + svfloat32_t v664 = svadd_f32_x(svptrue_b32(), v659, v661); + svfloat32_t v665 = svsub_f32_x(svptrue_b32(), v659, v661); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1059), v1222, + svreinterpret_u64_s16(v678)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1077), v1222, + svreinterpret_u64_s16(v694)); + svfloat32_t v700 = svadd_f32_x(svptrue_b32(), v610, v663); + svfloat32_t v701 = svsub_f32_x(svptrue_b32(), v610, v663); + svint16_t v704 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v504, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v720 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v557, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v734 = svadd_f32_x(svptrue_b32(), v612, v665); + svfloat32_t v735 = svsub_f32_x(svptrue_b32(), v612, v665); + svint16_t v738 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v506, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v754 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v559, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v768 = svadd_f32_x(svptrue_b32(), v611, v664); + svfloat32_t v769 = svsub_f32_x(svptrue_b32(), v611, v664); + svint16_t v772 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v505, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v788 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v558, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v802 = svadd_f32_x(svptrue_b32(), v609, v662); + svfloat32_t v803 = svsub_f32_x(svptrue_b32(), v609, v662); + svint16_t v806 = svtbl_s16( + 
svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v503, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v822 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v556, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v712 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v701, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v728 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v700, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v746 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v735, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v762 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v734, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v780 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v769, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v796 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v768, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v814 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v803, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v830 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v802, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1086), v1222, + svreinterpret_u64_s16(v704)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1104), v1222, + svreinterpret_u64_s16(v720)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1122), v1222, + svreinterpret_u64_s16(v738)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1140), v1222, + svreinterpret_u64_s16(v754)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1158), v1222, + svreinterpret_u64_s16(v772)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1176), v1222, + svreinterpret_u64_s16(v788)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1194), v1222, + svreinterpret_u64_s16(v806)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1212), v1222, + svreinterpret_u64_s16(v822)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1095), v1222, + svreinterpret_u64_s16(v712)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1113), v1222, + svreinterpret_u64_s16(v728)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1131), v1222, + svreinterpret_u64_s16(v746)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1149), v1222, + svreinterpret_u64_s16(v762)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1167), v1222, + 
svreinterpret_u64_s16(v780)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1185), v1222, + svreinterpret_u64_s16(v796)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1203), v1222, + svreinterpret_u64_s16(v814)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1221), v1222, + svreinterpret_u64_s16(v830)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cs16_ab_t_gs21(const armral_cmplx_f32_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, + const armral_cmplx_f32_t *restrict w, + int howmany, int idist, int odist, + float dir) { + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + int32_t *v6 = (int32_t *)y; + const float32x2_t *v7 = (const float32x2_t *)w; + for (int j = 0; j < howmany; j += 1) { + float32x2_t v402 = v5[istride]; + float v606 = -1.1666666666666665e+00F; + float v610 = 7.9015646852540022e-01F; + float v614 = 5.5854267289647742e-02F; + float v618 = 7.3430220123575241e-01F; + float v621 = 4.4095855184409838e-01F; + float v622 = -4.4095855184409838e-01F; + float v628 = 3.4087293062393137e-01F; + float v629 = -3.4087293062393137e-01F; + float v635 = -5.3396936033772524e-01F; + float v636 = 5.3396936033772524e-01F; + float v642 = 8.7484229096165667e-01F; + float v643 = -8.7484229096165667e-01F; + float v686 = -1.4999999999999998e+00F; + float v690 = 1.7499999999999996e+00F; + float v694 = -1.1852347027881001e+00F; + float v698 = -8.3781400934471603e-02F; + float v702 = -1.1014533018536286e+00F; + float v705 = -6.6143782776614746e-01F; + float v706 = 6.6143782776614746e-01F; + float v712 = -5.1130939593589697e-01F; + float v713 = 5.1130939593589697e-01F; + float v719 = 8.0095404050658769e-01F; + float v720 = -8.0095404050658769e-01F; + float v726 = -1.3122634364424848e+00F; + float v727 = 1.3122634364424848e+00F; + float v769 = 8.6602540378443871e-01F; + float v770 = -8.6602540378443871e-01F; + float v776 = -1.0103629710818451e+00F; + float v777 = 1.0103629710818451e+00F; + float v783 = 6.8429557470759583e-01F; + float v784 = -6.8429557470759583e-01F; + float v790 = 4.8371214382601155e-02F; + float v791 = -4.8371214382601155e-02F; + float v797 = 6.3592436032499466e-01F; + float v798 = -6.3592436032499466e-01F; + float32x2_t v800 = (float32x2_t){v4, v4}; + float v805 = -3.8188130791298663e-01F; + float v809 = -2.9520461738277515e-01F; + float v813 = 4.6243103089499693e-01F; + float v817 = -7.5763564827777208e-01F; + float32x2_t v439 = vtrn1_f32(v402, v402); + float32x2_t v440 = vtrn2_f32(v402, v402); + float32x2_t v564 = v5[0]; + float32x2_t v607 = (float32x2_t){v606, v606}; + float32x2_t v611 = (float32x2_t){v610, v610}; + float32x2_t v615 = (float32x2_t){v614, v614}; + float32x2_t v619 = (float32x2_t){v618, v618}; + float32x2_t v623 = (float32x2_t){v621, v622}; + float32x2_t v630 = (float32x2_t){v628, v629}; + float32x2_t v637 = (float32x2_t){v635, v636}; + float32x2_t v644 = (float32x2_t){v642, v643}; + float32x2_t v687 = (float32x2_t){v686, v686}; + float32x2_t v691 = (float32x2_t){v690, v690}; + float32x2_t v695 = (float32x2_t){v694, v694}; + float32x2_t v699 = (float32x2_t){v698, v698}; + float32x2_t v703 = (float32x2_t){v702, v702}; + float32x2_t v707 = (float32x2_t){v705, v706}; + float32x2_t v714 = (float32x2_t){v712, v713}; + float32x2_t v721 = (float32x2_t){v719, v720}; + float32x2_t v728 = (float32x2_t){v726, v727}; + float32x2_t v771 = (float32x2_t){v769, v770}; + float32x2_t v778 = (float32x2_t){v776, v777}; + float32x2_t v785 = (float32x2_t){v783, 
v784}; + float32x2_t v792 = (float32x2_t){v790, v791}; + float32x2_t v799 = (float32x2_t){v797, v798}; + float32x2_t v806 = (float32x2_t){v805, v805}; + float32x2_t v810 = (float32x2_t){v809, v809}; + float32x2_t v814 = (float32x2_t){v813, v813}; + float32x2_t v818 = (float32x2_t){v817, v817}; + float32x2_t v20 = v5[istride * 7]; + float32x2_t v38 = v5[istride * 14]; + int64_t v55 = 12 + j * 40; + int64_t v68 = 26 + j * 40; + float32x2_t v82 = v5[istride * 10]; + float32x2_t v100 = v5[istride * 17]; + int64_t v117 = 18 + j * 40; + int64_t v130 = 32 + j * 40; + float32x2_t v144 = v5[istride * 3]; + int64_t v148 = 4 + j * 40; + float32x2_t v162 = v5[istride * 13]; + float32x2_t v180 = v5[istride * 20]; + int64_t v197 = 24 + j * 40; + int64_t v210 = 38 + j * 40; + float32x2_t v224 = v5[istride * 6]; + int64_t v228 = 10 + j * 40; + float32x2_t v242 = v5[istride * 16]; + float32x2_t v260 = v5[istride * 2]; + int64_t v277 = 30 + j * 40; + int64_t v290 = 2 + j * 40; + float32x2_t v304 = v5[istride * 9]; + int64_t v308 = 16 + j * 40; + float32x2_t v322 = v5[istride * 19]; + float32x2_t v340 = v5[istride * 5]; + int64_t v357 = 36 + j * 40; + int64_t v370 = 8 + j * 40; + float32x2_t v384 = v5[istride * 12]; + int64_t v388 = 22 + j * 40; + float32x2_t v420 = v5[istride * 8]; + float32x2_t v438 = v7[j * 40]; + int64_t v442 = j * 40 + 1; + int64_t v450 = 14 + j * 40; + float32x2_t v464 = v5[istride * 15]; + int64_t v468 = 28 + j * 40; + float32x2_t v482 = v5[istride * 4]; + float32x2_t v500 = v5[istride * 11]; + int64_t v517 = 6 + j * 40; + int64_t v530 = 20 + j * 40; + float32x2_t v544 = v5[istride * 18]; + int64_t v548 = 34 + j * 40; + float32x2_t v625 = vmul_f32(v800, v623); + float32x2_t v632 = vmul_f32(v800, v630); + float32x2_t v639 = vmul_f32(v800, v637); + float32x2_t v646 = vmul_f32(v800, v644); + float32x2_t v709 = vmul_f32(v800, v707); + float32x2_t v716 = vmul_f32(v800, v714); + float32x2_t v723 = vmul_f32(v800, v721); + float32x2_t v730 = vmul_f32(v800, v728); + float32x2_t v773 = vmul_f32(v800, v771); + float32x2_t v780 = vmul_f32(v800, v778); + float32x2_t v787 = vmul_f32(v800, v785); + float32x2_t v794 = vmul_f32(v800, v792); + float32x2_t v801 = vmul_f32(v800, v799); + float32x2_t v56 = v7[v55]; + float32x2_t v57 = vtrn1_f32(v20, v20); + float32x2_t v58 = vtrn2_f32(v20, v20); + int64_t v60 = v55 + 1; + float32x2_t v69 = v7[v68]; + float32x2_t v70 = vtrn1_f32(v38, v38); + float32x2_t v71 = vtrn2_f32(v38, v38); + int64_t v73 = v68 + 1; + float32x2_t v118 = v7[v117]; + float32x2_t v119 = vtrn1_f32(v82, v82); + float32x2_t v120 = vtrn2_f32(v82, v82); + int64_t v122 = v117 + 1; + float32x2_t v131 = v7[v130]; + float32x2_t v132 = vtrn1_f32(v100, v100); + float32x2_t v133 = vtrn2_f32(v100, v100); + int64_t v135 = v130 + 1; + float32x2_t v149 = v7[v148]; + float32x2_t v150 = vtrn1_f32(v144, v144); + float32x2_t v151 = vtrn2_f32(v144, v144); + int64_t v153 = v148 + 1; + float32x2_t v198 = v7[v197]; + float32x2_t v199 = vtrn1_f32(v162, v162); + float32x2_t v200 = vtrn2_f32(v162, v162); + int64_t v202 = v197 + 1; + float32x2_t v211 = v7[v210]; + float32x2_t v212 = vtrn1_f32(v180, v180); + float32x2_t v213 = vtrn2_f32(v180, v180); + int64_t v215 = v210 + 1; + float32x2_t v229 = v7[v228]; + float32x2_t v230 = vtrn1_f32(v224, v224); + float32x2_t v231 = vtrn2_f32(v224, v224); + int64_t v233 = v228 + 1; + float32x2_t v278 = v7[v277]; + float32x2_t v279 = vtrn1_f32(v242, v242); + float32x2_t v280 = vtrn2_f32(v242, v242); + int64_t v282 = v277 + 1; + float32x2_t v291 = v7[v290]; + float32x2_t v292 = 
vtrn1_f32(v260, v260); + float32x2_t v293 = vtrn2_f32(v260, v260); + int64_t v295 = v290 + 1; + float32x2_t v309 = v7[v308]; + float32x2_t v310 = vtrn1_f32(v304, v304); + float32x2_t v311 = vtrn2_f32(v304, v304); + int64_t v313 = v308 + 1; + float32x2_t v358 = v7[v357]; + float32x2_t v359 = vtrn1_f32(v322, v322); + float32x2_t v360 = vtrn2_f32(v322, v322); + int64_t v362 = v357 + 1; + float32x2_t v371 = v7[v370]; + float32x2_t v372 = vtrn1_f32(v340, v340); + float32x2_t v373 = vtrn2_f32(v340, v340); + int64_t v375 = v370 + 1; + float32x2_t v389 = v7[v388]; + float32x2_t v390 = vtrn1_f32(v384, v384); + float32x2_t v391 = vtrn2_f32(v384, v384); + int64_t v393 = v388 + 1; + float32x2_t v443 = v7[v442]; + float32x2_t v444 = vmul_f32(v439, v438); + float32x2_t v451 = v7[v450]; + float32x2_t v452 = vtrn1_f32(v420, v420); + float32x2_t v453 = vtrn2_f32(v420, v420); + int64_t v455 = v450 + 1; + float32x2_t v469 = v7[v468]; + float32x2_t v470 = vtrn1_f32(v464, v464); + float32x2_t v471 = vtrn2_f32(v464, v464); + int64_t v473 = v468 + 1; + float32x2_t v518 = v7[v517]; + float32x2_t v519 = vtrn1_f32(v482, v482); + float32x2_t v520 = vtrn2_f32(v482, v482); + int64_t v522 = v517 + 1; + float32x2_t v531 = v7[v530]; + float32x2_t v532 = vtrn1_f32(v500, v500); + float32x2_t v533 = vtrn2_f32(v500, v500); + int64_t v535 = v530 + 1; + float32x2_t v549 = v7[v548]; + float32x2_t v550 = vtrn1_f32(v544, v544); + float32x2_t v551 = vtrn2_f32(v544, v544); + int64_t v553 = v548 + 1; + float32x2_t v61 = v7[v60]; + float32x2_t v62 = vmul_f32(v57, v56); + float32x2_t v74 = v7[v73]; + float32x2_t v75 = vmul_f32(v70, v69); + float32x2_t v123 = v7[v122]; + float32x2_t v124 = vmul_f32(v119, v118); + float32x2_t v136 = v7[v135]; + float32x2_t v137 = vmul_f32(v132, v131); + float32x2_t v154 = v7[v153]; + float32x2_t v155 = vmul_f32(v150, v149); + float32x2_t v203 = v7[v202]; + float32x2_t v204 = vmul_f32(v199, v198); + float32x2_t v216 = v7[v215]; + float32x2_t v217 = vmul_f32(v212, v211); + float32x2_t v234 = v7[v233]; + float32x2_t v235 = vmul_f32(v230, v229); + float32x2_t v283 = v7[v282]; + float32x2_t v284 = vmul_f32(v279, v278); + float32x2_t v296 = v7[v295]; + float32x2_t v297 = vmul_f32(v292, v291); + float32x2_t v314 = v7[v313]; + float32x2_t v315 = vmul_f32(v310, v309); + float32x2_t v363 = v7[v362]; + float32x2_t v364 = vmul_f32(v359, v358); + float32x2_t v376 = v7[v375]; + float32x2_t v377 = vmul_f32(v372, v371); + float32x2_t v394 = v7[v393]; + float32x2_t v395 = vmul_f32(v390, v389); + float32x2_t v456 = v7[v455]; + float32x2_t v457 = vmul_f32(v452, v451); + float32x2_t v474 = v7[v473]; + float32x2_t v475 = vmul_f32(v470, v469); + float32x2_t v523 = v7[v522]; + float32x2_t v524 = vmul_f32(v519, v518); + float32x2_t v536 = v7[v535]; + float32x2_t v537 = vmul_f32(v532, v531); + float32x2_t v554 = v7[v553]; + float32x2_t v555 = vmul_f32(v550, v549); + float32x2_t v446 = vfma_f32(v444, v440, v443); + float32x2_t v64 = vfma_f32(v62, v58, v61); + float32x2_t v77 = vfma_f32(v75, v71, v74); + float32x2_t v126 = vfma_f32(v124, v120, v123); + float32x2_t v139 = vfma_f32(v137, v133, v136); + float32x2_t v157 = vfma_f32(v155, v151, v154); + float32x2_t v206 = vfma_f32(v204, v200, v203); + float32x2_t v219 = vfma_f32(v217, v213, v216); + float32x2_t v237 = vfma_f32(v235, v231, v234); + float32x2_t v286 = vfma_f32(v284, v280, v283); + float32x2_t v299 = vfma_f32(v297, v293, v296); + float32x2_t v317 = vfma_f32(v315, v311, v314); + float32x2_t v366 = vfma_f32(v364, v360, v363); + float32x2_t v379 = vfma_f32(v377, v373, 
v376); + float32x2_t v397 = vfma_f32(v395, v391, v394); + float32x2_t v459 = vfma_f32(v457, v453, v456); + float32x2_t v477 = vfma_f32(v475, v471, v474); + float32x2_t v526 = vfma_f32(v524, v520, v523); + float32x2_t v539 = vfma_f32(v537, v533, v536); + float32x2_t v557 = vfma_f32(v555, v551, v554); + float32x2_t v558 = vadd_f32(v64, v77); + float32x2_t v559 = vsub_f32(v64, v77); + float32x2_t v566 = vadd_f32(v126, v139); + float32x2_t v567 = vsub_f32(v126, v139); + float32x2_t v569 = vadd_f32(v206, v219); + float32x2_t v570 = vsub_f32(v206, v219); + float32x2_t v572 = vadd_f32(v286, v299); + float32x2_t v573 = vsub_f32(v286, v299); + float32x2_t v575 = vadd_f32(v366, v379); + float32x2_t v576 = vsub_f32(v366, v379); + float32x2_t v578 = vadd_f32(v446, v459); + float32x2_t v579 = vsub_f32(v446, v459); + float32x2_t v581 = vadd_f32(v526, v539); + float32x2_t v582 = vsub_f32(v526, v539); + float32x2_t v565 = vadd_f32(v558, v564); + float32x2_t v568 = vadd_f32(v566, v157); + float32x2_t v571 = vadd_f32(v569, v237); + float32x2_t v574 = vadd_f32(v572, v317); + float32x2_t v577 = vadd_f32(v575, v397); + float32x2_t v580 = vadd_f32(v578, v477); + float32x2_t v583 = vadd_f32(v581, v557); + float32x2_t v668 = vadd_f32(v566, v581); + float32x2_t v669 = vsub_f32(v566, v581); + float32x2_t v670 = vadd_f32(v575, v572); + float32x2_t v671 = vsub_f32(v575, v572); + float32x2_t v672 = vadd_f32(v569, v578); + float32x2_t v673 = vsub_f32(v569, v578); + float32x2_t v752 = vadd_f32(v567, v582); + float32x2_t v753 = vsub_f32(v567, v582); + float32x2_t v754 = vadd_f32(v576, v573); + float32x2_t v755 = vsub_f32(v576, v573); + float32x2_t v756 = vadd_f32(v570, v579); + float32x2_t v757 = vsub_f32(v570, v579); + float32x2_t v584 = vadd_f32(v568, v583); + float32x2_t v585 = vsub_f32(v568, v583); + float32x2_t v586 = vadd_f32(v577, v574); + float32x2_t v587 = vsub_f32(v577, v574); + float32x2_t v588 = vadd_f32(v571, v580); + float32x2_t v589 = vsub_f32(v571, v580); + float32x2_t v674 = vadd_f32(v668, v670); + float32x2_t v677 = vsub_f32(v668, v670); + float32x2_t v678 = vsub_f32(v670, v672); + float32x2_t v679 = vsub_f32(v672, v668); + float32x2_t v680 = vadd_f32(v669, v671); + float32x2_t v682 = vsub_f32(v669, v671); + float32x2_t v683 = vsub_f32(v671, v673); + float32x2_t v684 = vsub_f32(v673, v669); + float32x2_t v758 = vadd_f32(v752, v754); + float32x2_t v761 = vsub_f32(v752, v754); + float32x2_t v762 = vsub_f32(v754, v756); + float32x2_t v763 = vsub_f32(v756, v752); + float32x2_t v764 = vadd_f32(v753, v755); + float32x2_t v766 = vsub_f32(v753, v755); + float32x2_t v767 = vsub_f32(v755, v757); + float32x2_t v768 = vsub_f32(v757, v753); + float32x2_t v590 = vadd_f32(v584, v586); + float32x2_t v593 = vsub_f32(v584, v586); + float32x2_t v594 = vsub_f32(v586, v588); + float32x2_t v595 = vsub_f32(v588, v584); + float32x2_t v596 = vadd_f32(v585, v587); + float32x2_t v598 = vsub_f32(v585, v587); + float32x2_t v599 = vsub_f32(v587, v589); + float32x2_t v600 = vsub_f32(v589, v585); + float32x2_t v675 = vadd_f32(v674, v672); + float32x2_t v681 = vadd_f32(v680, v673); + float32x2_t v696 = vmul_f32(v677, v695); + float32x2_t v700 = vmul_f32(v678, v699); + float32x2_t v704 = vmul_f32(v679, v703); + float32x2_t v717 = vrev64_f32(v682); + float32x2_t v724 = vrev64_f32(v683); + float32x2_t v731 = vrev64_f32(v684); + float32x2_t v759 = vadd_f32(v758, v756); + float32x2_t v765 = vadd_f32(v764, v757); + float32x2_t v788 = vrev64_f32(v761); + float32x2_t v795 = vrev64_f32(v762); + float32x2_t v802 = vrev64_f32(v763); + float32x2_t 
v811 = vmul_f32(v766, v810); + float32x2_t v815 = vmul_f32(v767, v814); + float32x2_t v819 = vmul_f32(v768, v818); + float32x2_t v591 = vadd_f32(v590, v588); + float32x2_t v597 = vadd_f32(v596, v589); + float32x2_t v612 = vmul_f32(v593, v611); + float32x2_t v616 = vmul_f32(v594, v615); + float32x2_t v620 = vmul_f32(v595, v619); + float32x2_t v633 = vrev64_f32(v598); + float32x2_t v640 = vrev64_f32(v599); + float32x2_t v647 = vrev64_f32(v600); + float32x2_t v676 = vadd_f32(v675, v558); + float32x2_t v692 = vmul_f32(v675, v691); + float32x2_t v710 = vrev64_f32(v681); + float32x2_t v718 = vmul_f32(v717, v716); + float32x2_t v725 = vmul_f32(v724, v723); + float32x2_t v732 = vmul_f32(v731, v730); + float32x2_t v760 = vadd_f32(v759, v559); + float32x2_t v781 = vrev64_f32(v759); + float32x2_t v789 = vmul_f32(v788, v787); + float32x2_t v796 = vmul_f32(v795, v794); + float32x2_t v803 = vmul_f32(v802, v801); + float32x2_t v807 = vmul_f32(v765, v806); + float32x2_t v592 = vadd_f32(v591, v565); + float32x2_t v608 = vmul_f32(v591, v607); + float32x2_t v626 = vrev64_f32(v597); + float32x2_t v634 = vmul_f32(v633, v632); + float32x2_t v641 = vmul_f32(v640, v639); + float32x2_t v648 = vmul_f32(v647, v646); + float32x2_t v688 = vmul_f32(v676, v687); + float32x2_t v711 = vmul_f32(v710, v709); + float32x2_t v774 = vrev64_f32(v760); + float32x2_t v782 = vmul_f32(v781, v780); + float32x2_t v827 = vadd_f32(v807, v811); + float32x2_t v829 = vsub_f32(v807, v811); + float32x2_t v831 = vsub_f32(v807, v815); + float32x2_t v627 = vmul_f32(v626, v625); + float32x2_t v649 = vadd_f32(v592, v608); + float32x2_t v733 = vadd_f32(v688, v692); + float32x2_t v740 = vadd_f32(v711, v718); + float32x2_t v742 = vsub_f32(v711, v718); + float32x2_t v744 = vsub_f32(v711, v725); + float32x2_t v775 = vmul_f32(v774, v773); + float32x2_t v828 = vadd_f32(v827, v815); + float32x2_t v830 = vsub_f32(v829, v819); + float32x2_t v832 = vadd_f32(v831, v819); + float32x2_t v839 = vadd_f32(v592, v688); + int16x4_t v844 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v592, 15), (int32x2_t){0, 0})); + float32x2_t v650 = vadd_f32(v649, v612); + float32x2_t v652 = vsub_f32(v649, v612); + float32x2_t v654 = vsub_f32(v649, v616); + float32x2_t v656 = vadd_f32(v627, v634); + float32x2_t v658 = vsub_f32(v627, v634); + float32x2_t v660 = vsub_f32(v627, v641); + float32x2_t v734 = vadd_f32(v733, v696); + float32x2_t v736 = vsub_f32(v733, v696); + float32x2_t v738 = vsub_f32(v733, v700); + float32x2_t v741 = vadd_f32(v740, v725); + float32x2_t v743 = vsub_f32(v742, v732); + float32x2_t v745 = vadd_f32(v744, v732); + float32x2_t v820 = vadd_f32(v775, v782); + float32x2_t v840 = vadd_f32(v839, v775); + float32x2_t v841 = vsub_f32(v839, v775); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v844), 0); + float32x2_t v651 = vadd_f32(v650, v616); + float32x2_t v653 = vsub_f32(v652, v620); + float32x2_t v655 = vadd_f32(v654, v620); + float32x2_t v657 = vadd_f32(v656, v641); + float32x2_t v659 = vsub_f32(v658, v648); + float32x2_t v661 = vadd_f32(v660, v648); + float32x2_t v735 = vadd_f32(v734, v700); + float32x2_t v737 = vsub_f32(v736, v704); + float32x2_t v739 = vadd_f32(v738, v704); + float32x2_t v821 = vadd_f32(v820, v789); + float32x2_t v823 = vsub_f32(v820, v789); + float32x2_t v825 = vsub_f32(v820, v796); + int16x4_t v850 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v841, 15), (int32x2_t){0, 0})); + int16x4_t v856 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v840, 15), (int32x2_t){0, 0})); + float32x2_t v662 = vadd_f32(v651, v657); + float32x2_t v663 = vsub_f32(v651, 
v657); + float32x2_t v664 = vadd_f32(v653, v659); + float32x2_t v665 = vsub_f32(v653, v659); + float32x2_t v666 = vadd_f32(v655, v661); + float32x2_t v667 = vsub_f32(v655, v661); + float32x2_t v746 = vadd_f32(v735, v741); + float32x2_t v747 = vsub_f32(v735, v741); + float32x2_t v748 = vadd_f32(v737, v743); + float32x2_t v749 = vsub_f32(v737, v743); + float32x2_t v750 = vadd_f32(v739, v745); + float32x2_t v751 = vsub_f32(v739, v745); + float32x2_t v822 = vadd_f32(v821, v796); + float32x2_t v824 = vsub_f32(v823, v803); + float32x2_t v826 = vadd_f32(v825, v803); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v850), 0); + v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v856), 0); + float32x2_t v833 = vadd_f32(v822, v828); + float32x2_t v834 = vsub_f32(v822, v828); + float32x2_t v835 = vadd_f32(v824, v830); + float32x2_t v836 = vsub_f32(v824, v830); + float32x2_t v837 = vadd_f32(v826, v832); + float32x2_t v838 = vsub_f32(v826, v832); + float32x2_t v860 = vadd_f32(v663, v747); + int16x4_t v865 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v663, 15), (int32x2_t){0, 0})); + float32x2_t v881 = vadd_f32(v665, v749); + int16x4_t v886 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v665, 15), (int32x2_t){0, 0})); + float32x2_t v902 = vadd_f32(v666, v750); + int16x4_t v907 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v666, 15), (int32x2_t){0, 0})); + float32x2_t v923 = vadd_f32(v667, v751); + int16x4_t v928 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v667, 15), (int32x2_t){0, 0})); + float32x2_t v944 = vadd_f32(v664, v748); + int16x4_t v949 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v664, 15), (int32x2_t){0, 0})); + float32x2_t v965 = vadd_f32(v662, v746); + int16x4_t v970 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v662, 15), (int32x2_t){0, 0})); + float32x2_t v861 = vadd_f32(v860, v834); + float32x2_t v862 = vsub_f32(v860, v834); + v6[ostride * 15] = vget_lane_s32(vreinterpret_s32_s16(v865), 0); + float32x2_t v882 = vadd_f32(v881, v836); + float32x2_t v883 = vsub_f32(v881, v836); + v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v886), 0); + float32x2_t v903 = vadd_f32(v902, v837); + float32x2_t v904 = vsub_f32(v902, v837); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v907), 0); + float32x2_t v924 = vadd_f32(v923, v838); + float32x2_t v925 = vsub_f32(v923, v838); + v6[ostride * 18] = vget_lane_s32(vreinterpret_s32_s16(v928), 0); + float32x2_t v945 = vadd_f32(v944, v835); + float32x2_t v946 = vsub_f32(v944, v835); + v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v949), 0); + float32x2_t v966 = vadd_f32(v965, v833); + float32x2_t v967 = vsub_f32(v965, v833); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v970), 0); + int16x4_t v871 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v862, 15), (int32x2_t){0, 0})); + int16x4_t v877 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v861, 15), (int32x2_t){0, 0})); + int16x4_t v892 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v883, 15), (int32x2_t){0, 0})); + int16x4_t v898 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v882, 15), (int32x2_t){0, 0})); + int16x4_t v913 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v904, 15), (int32x2_t){0, 0})); + int16x4_t v919 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v903, 15), (int32x2_t){0, 0})); + int16x4_t v934 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v925, 15), (int32x2_t){0, 0})); + int16x4_t v940 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v924, 15), (int32x2_t){0, 0})); + int16x4_t v955 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v946, 15), (int32x2_t){0, 0})); + int16x4_t v961 = + 
vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v945, 15), (int32x2_t){0, 0})); + int16x4_t v976 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v967, 15), (int32x2_t){0, 0})); + int16x4_t v982 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v966, 15), (int32x2_t){0, 0})); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v871), 0); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v877), 0); + v6[ostride * 16] = vget_lane_s32(vreinterpret_s32_s16(v892), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v898), 0); + v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v913), 0); + v6[ostride * 17] = vget_lane_s32(vreinterpret_s32_s16(v919), 0); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v934), 0); + v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v940), 0); + v6[ostride * 19] = vget_lane_s32(vreinterpret_s32_s16(v955), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v961), 0); + v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v976), 0); + v6[ostride * 20] = vget_lane_s32(vreinterpret_s32_s16(v982), 0); + v5 += 1 * idist; + v6 += 1 * odist; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cs16_ab_t_gs21(const armral_cmplx_f32_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, + const armral_cmplx_f32_t *restrict w, + int howmany, int idist, int odist, + float dir) { + int64_t v0 = istride; + int64_t v1 = idist; + int64_t v2 = ostride; + int64_t v3 = odist; + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + int32_t *v6 = (int32_t *)y; + const float32x2_t *v7 = (const float32x2_t *)w; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * v1; + int64_t v12 = v10 * v3; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + int64_t v13 = j; + float v447 = -1.1666666666666665e+00F; + float v452 = 7.9015646852540022e-01F; + float v457 = 5.5854267289647742e-02F; + float v462 = 7.3430220123575241e-01F; + float v467 = -4.4095855184409838e-01F; + float v474 = -3.4087293062393137e-01F; + float v481 = 5.3396936033772524e-01F; + float v488 = -8.7484229096165667e-01F; + float v531 = -1.4999999999999998e+00F; + float v536 = 1.7499999999999996e+00F; + float v541 = -1.1852347027881001e+00F; + float v546 = -8.3781400934471603e-02F; + float v551 = -1.1014533018536286e+00F; + float v556 = 6.6143782776614746e-01F; + float v563 = 5.1130939593589697e-01F; + float v570 = -8.0095404050658769e-01F; + float v577 = 1.3122634364424848e+00F; + float v620 = -8.6602540378443871e-01F; + float v627 = 1.0103629710818451e+00F; + float v634 = -6.8429557470759583e-01F; + float v641 = -4.8371214382601155e-02F; + float v648 = -6.3592436032499466e-01F; + float v655 = -3.8188130791298663e-01F; + float v660 = -2.9520461738277515e-01F; + float v665 = 4.6243103089499693e-01F; + float v670 = -7.5763564827777208e-01F; + const float32x2_t *v1014 = &v5[v0]; + int32_t *v1144 = &v6[v2]; + int64_t v19 = v0 * 7; + int64_t v33 = v0 * 14; + int64_t v48 = v10 * 6; + int64_t v55 = v10 * 13; + int64_t v61 = v0 * 10; + int64_t v75 = v0 * 17; + int64_t v90 = v10 * 9; + int64_t v97 = v10 * 16; + int64_t v103 = v0 * 3; + int64_t v111 = v10 * 2; + int64_t v117 = v0 * 13; + int64_t v131 = v0 * 20; + int64_t v146 = v10 * 12; + int64_t v153 = v10 * 19; + int64_t v159 = v0 * 6; + int64_t v167 = v10 * 5; + int64_t v173 = v0 * 16; + int64_t v187 = v0 * 2; + int64_t v202 = v10 * 15; + int64_t v215 = v0 * 9; + int64_t v223 = v10 * 8; + int64_t v229 = v0 * 19; + int64_t v243 = v0 * 5; + int64_t v258 = v10 
* 18; + int64_t v265 = v10 * 4; + int64_t v271 = v0 * 12; + int64_t v279 = v10 * 11; + int64_t v299 = v0 * 8; + int64_t v321 = v10 * 7; + int64_t v327 = v0 * 15; + int64_t v335 = v10 * 14; + int64_t v341 = v0 * 4; + int64_t v355 = v0 * 11; + int64_t v370 = v10 * 3; + int64_t v377 = v10 * 10; + int64_t v383 = v0 * 18; + int64_t v391 = v10 * 17; + int64_t v392 = v13 * 20; + float v470 = v4 * v467; + float v477 = v4 * v474; + float v484 = v4 * v481; + float v491 = v4 * v488; + float v559 = v4 * v556; + float v566 = v4 * v563; + float v573 = v4 * v570; + float v580 = v4 * v577; + float v623 = v4 * v620; + float v630 = v4 * v627; + float v637 = v4 * v634; + float v644 = v4 * v641; + float v651 = v4 * v648; + int64_t v705 = v2 * 7; + int64_t v713 = v2 * 14; + int64_t v724 = v2 * 15; + int64_t v740 = v2 * 8; + int64_t v751 = v2 * 9; + int64_t v759 = v2 * 16; + int64_t v767 = v2 * 2; + int64_t v778 = v2 * 3; + int64_t v786 = v2 * 10; + int64_t v794 = v2 * 17; + int64_t v805 = v2 * 18; + int64_t v813 = v2 * 4; + int64_t v821 = v2 * 11; + int64_t v832 = v2 * 12; + int64_t v840 = v2 * 19; + int64_t v848 = v2 * 5; + int64_t v859 = v2 * 6; + int64_t v867 = v2 * 13; + int64_t v875 = v2 * 20; + const float32x2_t *v1071 = &v5[0]; + svint64_t v1072 = svindex_s64(0, v1); + svfloat32_t v1075 = svdup_n_f32(v447); + svfloat32_t v1076 = svdup_n_f32(v452); + svfloat32_t v1077 = svdup_n_f32(v457); + svfloat32_t v1078 = svdup_n_f32(v462); + svfloat32_t v1083 = svdup_n_f32(v531); + svfloat32_t v1084 = svdup_n_f32(v536); + svfloat32_t v1085 = svdup_n_f32(v541); + svfloat32_t v1086 = svdup_n_f32(v546); + svfloat32_t v1087 = svdup_n_f32(v551); + svfloat32_t v1097 = svdup_n_f32(v655); + svfloat32_t v1098 = svdup_n_f32(v660); + svfloat32_t v1099 = svdup_n_f32(v665); + svfloat32_t v1100 = svdup_n_f32(v670); + int32_t *v1108 = &v6[0]; + svint64_t v1289 = svindex_s64(0, v3); + int64_t v50 = v48 + v392; + int64_t v57 = v55 + v392; + int64_t v92 = v90 + v392; + int64_t v99 = v97 + v392; + int64_t v113 = v111 + v392; + int64_t v148 = v146 + v392; + int64_t v155 = v153 + v392; + int64_t v169 = v167 + v392; + int64_t v204 = v202 + v392; + int64_t v211 = v10 + v392; + int64_t v225 = v223 + v392; + int64_t v260 = v258 + v392; + int64_t v267 = v265 + v392; + int64_t v281 = v279 + v392; + svfloat32_t v317 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v392])); + int64_t v323 = v321 + v392; + int64_t v337 = v335 + v392; + int64_t v372 = v370 + v392; + int64_t v379 = v377 + v392; + int64_t v393 = v391 + v392; + const float32x2_t *v888 = &v5[v19]; + const float32x2_t *v897 = &v5[v33]; + const float32x2_t *v906 = &v5[v61]; + const float32x2_t *v915 = &v5[v75]; + const float32x2_t *v924 = &v5[v103]; + const float32x2_t *v933 = &v5[v117]; + const float32x2_t *v942 = &v5[v131]; + const float32x2_t *v951 = &v5[v159]; + const float32x2_t *v960 = &v5[v173]; + const float32x2_t *v969 = &v5[v187]; + const float32x2_t *v978 = &v5[v215]; + const float32x2_t *v987 = &v5[v229]; + const float32x2_t *v996 = &v5[v243]; + const float32x2_t *v1005 = &v5[v271]; + svfloat32_t v1016 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1014), v1072)); + const float32x2_t *v1024 = &v5[v299]; + const float32x2_t *v1034 = &v5[v327]; + const float32x2_t *v1043 = &v5[v341]; + const float32x2_t *v1052 = &v5[v355]; + const float32x2_t *v1061 = &v5[v383]; + svfloat32_t v1073 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1071), v1072)); + svfloat32_t v1079 = svdup_n_f32(v470); + 
svfloat32_t v1080 = svdup_n_f32(v477); + svfloat32_t v1081 = svdup_n_f32(v484); + svfloat32_t v1082 = svdup_n_f32(v491); + svfloat32_t v1088 = svdup_n_f32(v559); + svfloat32_t v1089 = svdup_n_f32(v566); + svfloat32_t v1090 = svdup_n_f32(v573); + svfloat32_t v1091 = svdup_n_f32(v580); + svfloat32_t v1092 = svdup_n_f32(v623); + svfloat32_t v1093 = svdup_n_f32(v630); + svfloat32_t v1094 = svdup_n_f32(v637); + svfloat32_t v1095 = svdup_n_f32(v644); + svfloat32_t v1096 = svdup_n_f32(v651); + int32_t *v1117 = &v6[v705]; + int32_t *v1126 = &v6[v713]; + int32_t *v1135 = &v6[v724]; + int32_t *v1153 = &v6[v740]; + int32_t *v1162 = &v6[v751]; + int32_t *v1171 = &v6[v759]; + int32_t *v1180 = &v6[v767]; + int32_t *v1189 = &v6[v778]; + int32_t *v1198 = &v6[v786]; + int32_t *v1207 = &v6[v794]; + int32_t *v1216 = &v6[v805]; + int32_t *v1225 = &v6[v813]; + int32_t *v1234 = &v6[v821]; + int32_t *v1243 = &v6[v832]; + int32_t *v1252 = &v6[v840]; + int32_t *v1261 = &v6[v848]; + int32_t *v1270 = &v6[v859]; + int32_t *v1279 = &v6[v867]; + int32_t *v1288 = &v6[v875]; + svfloat32_t v51 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v50])); + svfloat32_t v58 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v57])); + svfloat32_t v93 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v92])); + svfloat32_t v100 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v99])); + svfloat32_t v114 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v113])); + svfloat32_t v149 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v148])); + svfloat32_t v156 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v155])); + svfloat32_t v170 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v169])); + svfloat32_t v205 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v204])); + svfloat32_t v212 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v211])); + svfloat32_t v226 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v225])); + svfloat32_t v261 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v260])); + svfloat32_t v268 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v267])); + svfloat32_t v282 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v281])); + svfloat32_t zero318 = svdup_n_f32(0); + svfloat32_t v318 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero318, v1016, v317, 0), v1016, + v317, 90); + svfloat32_t v324 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v323])); + svfloat32_t v338 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v337])); + svfloat32_t v373 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v372])); + svfloat32_t v380 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v379])); + svfloat32_t v394 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v393])); + svfloat32_t v890 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v888), v1072)); + svfloat32_t v899 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v897), v1072)); + svfloat32_t v908 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v906), v1072)); + svfloat32_t v917 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v915), v1072)); + svfloat32_t v926 = 
svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v924), v1072)); + svfloat32_t v935 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v933), v1072)); + svfloat32_t v944 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v942), v1072)); + svfloat32_t v953 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v951), v1072)); + svfloat32_t v962 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v960), v1072)); + svfloat32_t v971 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v969), v1072)); + svfloat32_t v980 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v978), v1072)); + svfloat32_t v989 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v987), v1072)); + svfloat32_t v998 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v996), v1072)); + svfloat32_t v1007 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1005), v1072)); + svfloat32_t v1026 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1024), v1072)); + svfloat32_t v1036 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1034), v1072)); + svfloat32_t v1045 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1043), v1072)); + svfloat32_t v1054 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1052), v1072)); + svfloat32_t v1063 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1061), v1072)); + svfloat32_t zero52 = svdup_n_f32(0); + svfloat32_t v52 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero52, v890, v51, 0), + v890, v51, 90); + svfloat32_t zero59 = svdup_n_f32(0); + svfloat32_t v59 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero59, v899, v58, 0), + v899, v58, 90); + svfloat32_t zero94 = svdup_n_f32(0); + svfloat32_t v94 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero94, v908, v93, 0), + v908, v93, 90); + svfloat32_t zero101 = svdup_n_f32(0); + svfloat32_t v101 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero101, v917, v100, 0), + v917, v100, 90); + svfloat32_t zero150 = svdup_n_f32(0); + svfloat32_t v150 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero150, v935, v149, 0), + v935, v149, 90); + svfloat32_t zero157 = svdup_n_f32(0); + svfloat32_t v157 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero157, v944, v156, 0), + v944, v156, 90); + svfloat32_t zero206 = svdup_n_f32(0); + svfloat32_t v206 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero206, v962, v205, 0), + v962, v205, 90); + svfloat32_t zero213 = svdup_n_f32(0); + svfloat32_t v213 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero213, v971, v212, 0), + v971, v212, 90); + svfloat32_t zero262 = svdup_n_f32(0); + svfloat32_t v262 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero262, v989, v261, 0), + v989, v261, 90); + svfloat32_t zero269 = svdup_n_f32(0); + svfloat32_t v269 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero269, v998, v268, 0), + v998, v268, 90); + svfloat32_t zero325 = svdup_n_f32(0); + svfloat32_t v325 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero325, v1026, v324, 0), v1026, + v324, 90); + svfloat32_t zero374 = svdup_n_f32(0); + svfloat32_t v374 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero374, v1045, 
v373, 0), v1045, + v373, 90); + svfloat32_t zero381 = svdup_n_f32(0); + svfloat32_t v381 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero381, v1054, v380, 0), v1054, + v380, 90); + svfloat32_t v396 = svadd_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v397 = svsub_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v406 = svadd_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v407 = svsub_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v409 = svadd_f32_x(svptrue_b32(), v150, v157); + svfloat32_t v410 = svsub_f32_x(svptrue_b32(), v150, v157); + svfloat32_t v412 = svadd_f32_x(svptrue_b32(), v206, v213); + svfloat32_t v413 = svsub_f32_x(svptrue_b32(), v206, v213); + svfloat32_t v415 = svadd_f32_x(svptrue_b32(), v262, v269); + svfloat32_t v416 = svsub_f32_x(svptrue_b32(), v262, v269); + svfloat32_t v418 = svadd_f32_x(svptrue_b32(), v318, v325); + svfloat32_t v419 = svsub_f32_x(svptrue_b32(), v318, v325); + svfloat32_t v421 = svadd_f32_x(svptrue_b32(), v374, v381); + svfloat32_t v422 = svsub_f32_x(svptrue_b32(), v374, v381); + svfloat32_t v405 = svadd_f32_x(svptrue_b32(), v396, v1073); + svfloat32_t v408 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, v406, v926, v114, 0), + v926, v114, 90); + svfloat32_t v411 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, v409, v953, v170, 0), + v953, v170, 90); + svfloat32_t v414 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, v412, v980, v226, 0), + v980, v226, 90); + svfloat32_t v417 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, v415, v1007, v282, 0), + v1007, v282, 90); + svfloat32_t v420 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, v418, v1036, v338, 0), + v1036, v338, 90); + svfloat32_t v423 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, v421, v1063, v394, 0), + v1063, v394, 90); + svfloat32_t v513 = svadd_f32_x(svptrue_b32(), v406, v421); + svfloat32_t v514 = svsub_f32_x(svptrue_b32(), v406, v421); + svfloat32_t v515 = svadd_f32_x(svptrue_b32(), v415, v412); + svfloat32_t v516 = svsub_f32_x(svptrue_b32(), v415, v412); + svfloat32_t v517 = svadd_f32_x(svptrue_b32(), v409, v418); + svfloat32_t v518 = svsub_f32_x(svptrue_b32(), v409, v418); + svfloat32_t v602 = svadd_f32_x(svptrue_b32(), v407, v422); + svfloat32_t v603 = svsub_f32_x(svptrue_b32(), v407, v422); + svfloat32_t v604 = svadd_f32_x(svptrue_b32(), v416, v413); + svfloat32_t v605 = svsub_f32_x(svptrue_b32(), v416, v413); + svfloat32_t v606 = svadd_f32_x(svptrue_b32(), v410, v419); + svfloat32_t v607 = svsub_f32_x(svptrue_b32(), v410, v419); + svfloat32_t v424 = svadd_f32_x(svptrue_b32(), v408, v423); + svfloat32_t v425 = svsub_f32_x(svptrue_b32(), v408, v423); + svfloat32_t v426 = svadd_f32_x(svptrue_b32(), v417, v414); + svfloat32_t v427 = svsub_f32_x(svptrue_b32(), v417, v414); + svfloat32_t v428 = svadd_f32_x(svptrue_b32(), v411, v420); + svfloat32_t v429 = svsub_f32_x(svptrue_b32(), v411, v420); + svfloat32_t v519 = svadd_f32_x(svptrue_b32(), v513, v515); + svfloat32_t v522 = svsub_f32_x(svptrue_b32(), v513, v515); + svfloat32_t v523 = svsub_f32_x(svptrue_b32(), v515, v517); + svfloat32_t v524 = svsub_f32_x(svptrue_b32(), v517, v513); + svfloat32_t v525 = svadd_f32_x(svptrue_b32(), v514, v516); + svfloat32_t v527 = svsub_f32_x(svptrue_b32(), v514, v516); + svfloat32_t v528 = svsub_f32_x(svptrue_b32(), v516, v518); + svfloat32_t v529 = svsub_f32_x(svptrue_b32(), v518, v514); + svfloat32_t v608 = svadd_f32_x(svptrue_b32(), v602, v604); + svfloat32_t v611 = svsub_f32_x(svptrue_b32(), v602, v604); + svfloat32_t v612 = svsub_f32_x(svptrue_b32(), v604, v606); + 
svfloat32_t v613 = svsub_f32_x(svptrue_b32(), v606, v602); + svfloat32_t v614 = svadd_f32_x(svptrue_b32(), v603, v605); + svfloat32_t v616 = svsub_f32_x(svptrue_b32(), v603, v605); + svfloat32_t v617 = svsub_f32_x(svptrue_b32(), v605, v607); + svfloat32_t v618 = svsub_f32_x(svptrue_b32(), v607, v603); + svfloat32_t v430 = svadd_f32_x(svptrue_b32(), v424, v426); + svfloat32_t v433 = svsub_f32_x(svptrue_b32(), v424, v426); + svfloat32_t v434 = svsub_f32_x(svptrue_b32(), v426, v428); + svfloat32_t v435 = svsub_f32_x(svptrue_b32(), v428, v424); + svfloat32_t v436 = svadd_f32_x(svptrue_b32(), v425, v427); + svfloat32_t v438 = svsub_f32_x(svptrue_b32(), v425, v427); + svfloat32_t v439 = svsub_f32_x(svptrue_b32(), v427, v429); + svfloat32_t v440 = svsub_f32_x(svptrue_b32(), v429, v425); + svfloat32_t v520 = svadd_f32_x(svptrue_b32(), v519, v517); + svfloat32_t v526 = svadd_f32_x(svptrue_b32(), v525, v518); + svfloat32_t zero568 = svdup_n_f32(0); + svfloat32_t v568 = svcmla_f32_x(pred_full, zero568, v1089, v527, 90); + svfloat32_t zero575 = svdup_n_f32(0); + svfloat32_t v575 = svcmla_f32_x(pred_full, zero575, v1090, v528, 90); + svfloat32_t zero582 = svdup_n_f32(0); + svfloat32_t v582 = svcmla_f32_x(pred_full, zero582, v1091, v529, 90); + svfloat32_t v609 = svadd_f32_x(svptrue_b32(), v608, v606); + svfloat32_t v615 = svadd_f32_x(svptrue_b32(), v614, v607); + svfloat32_t zero639 = svdup_n_f32(0); + svfloat32_t v639 = svcmla_f32_x(pred_full, zero639, v1094, v611, 90); + svfloat32_t zero646 = svdup_n_f32(0); + svfloat32_t v646 = svcmla_f32_x(pred_full, zero646, v1095, v612, 90); + svfloat32_t zero653 = svdup_n_f32(0); + svfloat32_t v653 = svcmla_f32_x(pred_full, zero653, v1096, v613, 90); + svfloat32_t v663 = svmul_f32_x(svptrue_b32(), v616, v1098); + svfloat32_t v668 = svmul_f32_x(svptrue_b32(), v617, v1099); + svfloat32_t v431 = svadd_f32_x(svptrue_b32(), v430, v428); + svfloat32_t v437 = svadd_f32_x(svptrue_b32(), v436, v429); + svfloat32_t zero479 = svdup_n_f32(0); + svfloat32_t v479 = svcmla_f32_x(pred_full, zero479, v1080, v438, 90); + svfloat32_t zero486 = svdup_n_f32(0); + svfloat32_t v486 = svcmla_f32_x(pred_full, zero486, v1081, v439, 90); + svfloat32_t zero493 = svdup_n_f32(0); + svfloat32_t v493 = svcmla_f32_x(pred_full, zero493, v1082, v440, 90); + svfloat32_t v521 = svadd_f32_x(svptrue_b32(), v520, v396); + svfloat32_t v539 = svmul_f32_x(svptrue_b32(), v520, v1084); + svfloat32_t zero561 = svdup_n_f32(0); + svfloat32_t v561 = svcmla_f32_x(pred_full, zero561, v1088, v526, 90); + svfloat32_t v610 = svadd_f32_x(svptrue_b32(), v609, v397); + svfloat32_t v432 = svadd_f32_x(svptrue_b32(), v431, v405); + svfloat32_t zero472 = svdup_n_f32(0); + svfloat32_t v472 = svcmla_f32_x(pred_full, zero472, v1079, v437, 90); + svfloat32_t v590 = svadd_f32_x(svptrue_b32(), v561, v568); + svfloat32_t v592 = svsub_f32_x(svptrue_b32(), v561, v568); + svfloat32_t v594 = svsub_f32_x(svptrue_b32(), v561, v575); + svfloat32_t zero625 = svdup_n_f32(0); + svfloat32_t v625 = svcmla_f32_x(pred_full, zero625, v1092, v610, 90); + svfloat32_t v681 = svmla_f32_x(pred_full, v663, v615, v1097); + svfloat32_t v683 = svnmls_f32_x(pred_full, v663, v615, v1097); + svfloat32_t v685 = svnmls_f32_x(pred_full, v668, v615, v1097); + svfloat32_t v494 = svmla_f32_x(pred_full, v432, v431, v1075); + svfloat32_t v501 = svadd_f32_x(svptrue_b32(), v472, v479); + svfloat32_t v503 = svsub_f32_x(svptrue_b32(), v472, v479); + svfloat32_t v505 = svsub_f32_x(svptrue_b32(), v472, v486); + svfloat32_t v583 = svmla_f32_x(pred_full, v539, v521, 
v1083); + svfloat32_t v591 = svadd_f32_x(svptrue_b32(), v590, v575); + svfloat32_t v593 = svsub_f32_x(svptrue_b32(), v592, v582); + svfloat32_t v595 = svadd_f32_x(svptrue_b32(), v594, v582); + svfloat32_t v674 = svcmla_f32_x(pred_full, v625, v1093, v609, 90); + svfloat32_t v682 = svmla_f32_x(pred_full, v681, v617, v1099); + svfloat32_t v684 = svmls_f32_x(pred_full, v683, v618, v1100); + svfloat32_t v686 = svmla_f32_x(pred_full, v685, v618, v1100); + svfloat32_t v693 = svmla_f32_x(pred_full, v432, v521, v1083); + svint16_t v698 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v432, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v495 = svmla_f32_x(pred_full, v494, v433, v1076); + svfloat32_t v497 = svmls_f32_x(pred_full, v494, v433, v1076); + svfloat32_t v499 = svmls_f32_x(pred_full, v494, v434, v1077); + svfloat32_t v502 = svadd_f32_x(svptrue_b32(), v501, v486); + svfloat32_t v504 = svsub_f32_x(svptrue_b32(), v503, v493); + svfloat32_t v506 = svadd_f32_x(svptrue_b32(), v505, v493); + svfloat32_t v584 = svmla_f32_x(pred_full, v583, v522, v1085); + svfloat32_t v586 = svmls_f32_x(pred_full, v583, v522, v1085); + svfloat32_t v588 = svmls_f32_x(pred_full, v583, v523, v1086); + svfloat32_t v675 = svadd_f32_x(svptrue_b32(), v674, v639); + svfloat32_t v677 = svsub_f32_x(svptrue_b32(), v674, v639); + svfloat32_t v679 = svsub_f32_x(svptrue_b32(), v674, v646); + svfloat32_t v694 = svadd_f32_x(svptrue_b32(), v693, v625); + svfloat32_t v695 = svsub_f32_x(svptrue_b32(), v693, v625); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1108), v1289, + svreinterpret_u64_s16(v698)); + svfloat32_t v496 = svmla_f32_x(pred_full, v495, v434, v1077); + svfloat32_t v498 = svmls_f32_x(pred_full, v497, v435, v1078); + svfloat32_t v500 = svmla_f32_x(pred_full, v499, v435, v1078); + svfloat32_t v585 = svmla_f32_x(pred_full, v584, v523, v1086); + svfloat32_t v587 = svmls_f32_x(pred_full, v586, v524, v1087); + svfloat32_t v589 = svmla_f32_x(pred_full, v588, v524, v1087); + svfloat32_t v676 = svadd_f32_x(svptrue_b32(), v675, v646); + svfloat32_t v678 = svsub_f32_x(svptrue_b32(), v677, v653); + svfloat32_t v680 = svadd_f32_x(svptrue_b32(), v679, v653); + svint16_t v706 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v695, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v714 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v694, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v507 = svadd_f32_x(svptrue_b32(), v496, v502); + svfloat32_t v508 = svsub_f32_x(svptrue_b32(), v496, v502); + svfloat32_t v509 = svadd_f32_x(svptrue_b32(), v498, v504); + svfloat32_t v510 = svsub_f32_x(svptrue_b32(), v498, v504); + svfloat32_t v511 = svadd_f32_x(svptrue_b32(), v500, v506); + svfloat32_t v512 = svsub_f32_x(svptrue_b32(), v500, v506); + svfloat32_t v596 = svadd_f32_x(svptrue_b32(), v585, v591); + svfloat32_t v597 = svsub_f32_x(svptrue_b32(), v585, v591); + svfloat32_t v598 = svadd_f32_x(svptrue_b32(), v587, v593); + svfloat32_t v599 = svsub_f32_x(svptrue_b32(), v587, v593); + svfloat32_t v600 = svadd_f32_x(svptrue_b32(), v589, v595); + svfloat32_t v601 = svsub_f32_x(svptrue_b32(), v589, v595); + svfloat32_t v687 = svadd_f32_x(svptrue_b32(), v676, v682); + svfloat32_t 
v688 = svsub_f32_x(svptrue_b32(), v676, v682); + svfloat32_t v689 = svadd_f32_x(svptrue_b32(), v678, v684); + svfloat32_t v690 = svsub_f32_x(svptrue_b32(), v678, v684); + svfloat32_t v691 = svadd_f32_x(svptrue_b32(), v680, v686); + svfloat32_t v692 = svsub_f32_x(svptrue_b32(), v680, v686); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1117), v1289, + svreinterpret_u64_s16(v706)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1126), v1289, + svreinterpret_u64_s16(v714)); + svfloat32_t v720 = svadd_f32_x(svptrue_b32(), v508, v597); + svint16_t v725 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v508, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v747 = svadd_f32_x(svptrue_b32(), v510, v599); + svint16_t v752 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v510, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v774 = svadd_f32_x(svptrue_b32(), v511, v600); + svint16_t v779 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v511, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v801 = svadd_f32_x(svptrue_b32(), v512, v601); + svint16_t v806 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v512, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v828 = svadd_f32_x(svptrue_b32(), v509, v598); + svint16_t v833 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v509, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v855 = svadd_f32_x(svptrue_b32(), v507, v596); + svint16_t v860 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v507, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v721 = svadd_f32_x(svptrue_b32(), v720, v688); + svfloat32_t v722 = svsub_f32_x(svptrue_b32(), v720, v688); + svfloat32_t v748 = svadd_f32_x(svptrue_b32(), v747, v690); + svfloat32_t v749 = svsub_f32_x(svptrue_b32(), v747, v690); + svfloat32_t v775 = svadd_f32_x(svptrue_b32(), v774, v691); + svfloat32_t v776 = svsub_f32_x(svptrue_b32(), v774, v691); + svfloat32_t v802 = svadd_f32_x(svptrue_b32(), v801, v692); + svfloat32_t v803 = svsub_f32_x(svptrue_b32(), v801, v692); + svfloat32_t v829 = svadd_f32_x(svptrue_b32(), v828, v689); + svfloat32_t v830 = svsub_f32_x(svptrue_b32(), v828, v689); + svfloat32_t v856 = svadd_f32_x(svptrue_b32(), v855, v687); + svfloat32_t v857 = svsub_f32_x(svptrue_b32(), v855, v687); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1135), v1289, + svreinterpret_u64_s16(v725)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1162), v1289, + svreinterpret_u64_s16(v752)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1189), v1289, + svreinterpret_u64_s16(v779)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1216), v1289, + svreinterpret_u64_s16(v806)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1243), v1289, + svreinterpret_u64_s16(v833)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1270), v1289, + 
svreinterpret_u64_s16(v860)); + svint16_t v733 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v722, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v741 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v721, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v760 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v749, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v768 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v748, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v787 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v776, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v795 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v775, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v814 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v803, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v822 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v802, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v841 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v830, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v849 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v829, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v868 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v857, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v876 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v856, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1144), v1289, + svreinterpret_u64_s16(v733)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1153), v1289, + svreinterpret_u64_s16(v741)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1171), v1289, + svreinterpret_u64_s16(v760)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1180), v1289, + svreinterpret_u64_s16(v768)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1198), v1289, + svreinterpret_u64_s16(v787)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1207), v1289, + svreinterpret_u64_s16(v795)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1225), v1289, + svreinterpret_u64_s16(v814)); + svst1w_scatter_s64index_u64(pred_full, (unsigned 
*)(v1234), v1289, + svreinterpret_u64_s16(v822)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1252), v1289, + svreinterpret_u64_s16(v841)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1261), v1289, + svreinterpret_u64_s16(v849)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1279), v1289, + svreinterpret_u64_s16(v868)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1288), v1289, + svreinterpret_u64_s16(v876)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cs16_ab_t_gs22(const armral_cmplx_f32_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, + const armral_cmplx_f32_t *restrict w, + int howmany, int idist, int odist, + float dir) { + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + int32_t *v6 = (int32_t *)y; + const float32x2_t *v7 = (const float32x2_t *)w; + for (int j = 0; j < howmany; j += 1) { + float32x2_t v379 = v5[istride]; + float v934 = 1.1000000000000001e+00F; + float v937 = 3.3166247903554003e-01F; + float v938 = -3.3166247903554003e-01F; + float v945 = 5.1541501300188641e-01F; + float v949 = 9.4125353283118118e-01F; + float v953 = 1.4143537075597825e+00F; + float v957 = 8.5949297361449750e-01F; + float v961 = 4.2314838273285138e-02F; + float v965 = 3.8639279888589606e-01F; + float v969 = 5.1254589567200015e-01F; + float v973 = 1.0702757469471715e+00F; + float v977 = 5.5486073394528512e-01F; + float v980 = 1.2412944743900585e+00F; + float v981 = -1.2412944743900585e+00F; + float v987 = 2.0897833842005756e-01F; + float v988 = -2.0897833842005756e-01F; + float v994 = 3.7415717312460811e-01F; + float v995 = -3.7415717312460811e-01F; + float v1001 = 4.9929922194110327e-02F; + float v1002 = -4.9929922194110327e-02F; + float v1008 = 6.5815896284539266e-01F; + float v1009 = -6.5815896284539266e-01F; + float v1015 = 6.3306543373877577e-01F; + float v1016 = -6.3306543373877577e-01F; + float v1022 = 1.0822460581641109e+00F; + float v1023 = -1.0822460581641109e+00F; + float v1029 = 8.1720737907134022e-01F; + float v1030 = -8.1720737907134022e-01F; + float v1036 = 4.2408709531871824e-01F; + float v1037 = -4.2408709531871824e-01F; + float32x2_t v1039 = (float32x2_t){v4, v4}; + float32x2_t v411 = vtrn1_f32(v379, v379); + float32x2_t v412 = vtrn2_f32(v379, v379); + float32x2_t v671 = v5[0]; + float32x2_t v935 = (float32x2_t){v934, v934}; + float32x2_t v939 = (float32x2_t){v937, v938}; + float32x2_t v946 = (float32x2_t){v945, v945}; + float32x2_t v950 = (float32x2_t){v949, v949}; + float32x2_t v954 = (float32x2_t){v953, v953}; + float32x2_t v958 = (float32x2_t){v957, v957}; + float32x2_t v962 = (float32x2_t){v961, v961}; + float32x2_t v966 = (float32x2_t){v965, v965}; + float32x2_t v970 = (float32x2_t){v969, v969}; + float32x2_t v974 = (float32x2_t){v973, v973}; + float32x2_t v978 = (float32x2_t){v977, v977}; + float32x2_t v982 = (float32x2_t){v980, v981}; + float32x2_t v989 = (float32x2_t){v987, v988}; + float32x2_t v996 = (float32x2_t){v994, v995}; + float32x2_t v1003 = (float32x2_t){v1001, v1002}; + float32x2_t v1010 = (float32x2_t){v1008, v1009}; + float32x2_t v1017 = (float32x2_t){v1015, v1016}; + float32x2_t v1024 = (float32x2_t){v1022, v1023}; + float32x2_t v1031 = (float32x2_t){v1029, v1030}; + float32x2_t v1038 = (float32x2_t){v1036, v1037}; + float32x2_t v20 = v5[istride * 11]; + int64_t v37 = 20 + j * 42; + float32x2_t v51 = v5[istride * 2]; + float32x2_t v69 = v5[istride * 13]; + int64_t v86 = 2 + j * 42; + int64_t v99 = 24 + j * 42; + float32x2_t 
v113 = v5[istride * 4]; + float32x2_t v131 = v5[istride * 15]; + int64_t v148 = 6 + j * 42; + int64_t v161 = 28 + j * 42; + float32x2_t v175 = v5[istride * 6]; + float32x2_t v193 = v5[istride * 17]; + int64_t v210 = 10 + j * 42; + int64_t v223 = 32 + j * 42; + float32x2_t v237 = v5[istride * 8]; + float32x2_t v255 = v5[istride * 19]; + int64_t v272 = 14 + j * 42; + int64_t v285 = 36 + j * 42; + float32x2_t v299 = v5[istride * 10]; + float32x2_t v317 = v5[istride * 21]; + int64_t v334 = 18 + j * 42; + int64_t v347 = 40 + j * 42; + float32x2_t v361 = v5[istride * 12]; + int64_t v396 = 22 + j * 42; + float32x2_t v410 = v7[j * 42]; + int64_t v414 = j * 42 + 1; + float32x2_t v423 = v5[istride * 14]; + float32x2_t v441 = v5[istride * 3]; + int64_t v458 = 26 + j * 42; + int64_t v471 = 4 + j * 42; + float32x2_t v485 = v5[istride * 16]; + float32x2_t v503 = v5[istride * 5]; + int64_t v520 = 30 + j * 42; + int64_t v533 = 8 + j * 42; + float32x2_t v547 = v5[istride * 18]; + float32x2_t v565 = v5[istride * 7]; + int64_t v582 = 34 + j * 42; + int64_t v595 = 12 + j * 42; + float32x2_t v609 = v5[istride * 20]; + float32x2_t v627 = v5[istride * 9]; + int64_t v644 = 38 + j * 42; + int64_t v657 = 16 + j * 42; + float32x2_t v941 = vmul_f32(v1039, v939); + float32x2_t v984 = vmul_f32(v1039, v982); + float32x2_t v991 = vmul_f32(v1039, v989); + float32x2_t v998 = vmul_f32(v1039, v996); + float32x2_t v1005 = vmul_f32(v1039, v1003); + float32x2_t v1012 = vmul_f32(v1039, v1010); + float32x2_t v1019 = vmul_f32(v1039, v1017); + float32x2_t v1026 = vmul_f32(v1039, v1024); + float32x2_t v1033 = vmul_f32(v1039, v1031); + float32x2_t v1040 = vmul_f32(v1039, v1038); + float32x2_t v38 = v7[v37]; + float32x2_t v39 = vtrn1_f32(v20, v20); + float32x2_t v40 = vtrn2_f32(v20, v20); + int64_t v42 = v37 + 1; + float32x2_t v87 = v7[v86]; + float32x2_t v88 = vtrn1_f32(v51, v51); + float32x2_t v89 = vtrn2_f32(v51, v51); + int64_t v91 = v86 + 1; + float32x2_t v100 = v7[v99]; + float32x2_t v101 = vtrn1_f32(v69, v69); + float32x2_t v102 = vtrn2_f32(v69, v69); + int64_t v104 = v99 + 1; + float32x2_t v149 = v7[v148]; + float32x2_t v150 = vtrn1_f32(v113, v113); + float32x2_t v151 = vtrn2_f32(v113, v113); + int64_t v153 = v148 + 1; + float32x2_t v162 = v7[v161]; + float32x2_t v163 = vtrn1_f32(v131, v131); + float32x2_t v164 = vtrn2_f32(v131, v131); + int64_t v166 = v161 + 1; + float32x2_t v211 = v7[v210]; + float32x2_t v212 = vtrn1_f32(v175, v175); + float32x2_t v213 = vtrn2_f32(v175, v175); + int64_t v215 = v210 + 1; + float32x2_t v224 = v7[v223]; + float32x2_t v225 = vtrn1_f32(v193, v193); + float32x2_t v226 = vtrn2_f32(v193, v193); + int64_t v228 = v223 + 1; + float32x2_t v273 = v7[v272]; + float32x2_t v274 = vtrn1_f32(v237, v237); + float32x2_t v275 = vtrn2_f32(v237, v237); + int64_t v277 = v272 + 1; + float32x2_t v286 = v7[v285]; + float32x2_t v287 = vtrn1_f32(v255, v255); + float32x2_t v288 = vtrn2_f32(v255, v255); + int64_t v290 = v285 + 1; + float32x2_t v335 = v7[v334]; + float32x2_t v336 = vtrn1_f32(v299, v299); + float32x2_t v337 = vtrn2_f32(v299, v299); + int64_t v339 = v334 + 1; + float32x2_t v348 = v7[v347]; + float32x2_t v349 = vtrn1_f32(v317, v317); + float32x2_t v350 = vtrn2_f32(v317, v317); + int64_t v352 = v347 + 1; + float32x2_t v397 = v7[v396]; + float32x2_t v398 = vtrn1_f32(v361, v361); + float32x2_t v399 = vtrn2_f32(v361, v361); + int64_t v401 = v396 + 1; + float32x2_t v415 = v7[v414]; + float32x2_t v416 = vmul_f32(v411, v410); + float32x2_t v459 = v7[v458]; + float32x2_t v460 = vtrn1_f32(v423, v423); + float32x2_t 
v461 = vtrn2_f32(v423, v423); + int64_t v463 = v458 + 1; + float32x2_t v472 = v7[v471]; + float32x2_t v473 = vtrn1_f32(v441, v441); + float32x2_t v474 = vtrn2_f32(v441, v441); + int64_t v476 = v471 + 1; + float32x2_t v521 = v7[v520]; + float32x2_t v522 = vtrn1_f32(v485, v485); + float32x2_t v523 = vtrn2_f32(v485, v485); + int64_t v525 = v520 + 1; + float32x2_t v534 = v7[v533]; + float32x2_t v535 = vtrn1_f32(v503, v503); + float32x2_t v536 = vtrn2_f32(v503, v503); + int64_t v538 = v533 + 1; + float32x2_t v583 = v7[v582]; + float32x2_t v584 = vtrn1_f32(v547, v547); + float32x2_t v585 = vtrn2_f32(v547, v547); + int64_t v587 = v582 + 1; + float32x2_t v596 = v7[v595]; + float32x2_t v597 = vtrn1_f32(v565, v565); + float32x2_t v598 = vtrn2_f32(v565, v565); + int64_t v600 = v595 + 1; + float32x2_t v645 = v7[v644]; + float32x2_t v646 = vtrn1_f32(v609, v609); + float32x2_t v647 = vtrn2_f32(v609, v609); + int64_t v649 = v644 + 1; + float32x2_t v658 = v7[v657]; + float32x2_t v659 = vtrn1_f32(v627, v627); + float32x2_t v660 = vtrn2_f32(v627, v627); + int64_t v662 = v657 + 1; + float32x2_t v43 = v7[v42]; + float32x2_t v44 = vmul_f32(v39, v38); + float32x2_t v92 = v7[v91]; + float32x2_t v93 = vmul_f32(v88, v87); + float32x2_t v105 = v7[v104]; + float32x2_t v106 = vmul_f32(v101, v100); + float32x2_t v154 = v7[v153]; + float32x2_t v155 = vmul_f32(v150, v149); + float32x2_t v167 = v7[v166]; + float32x2_t v168 = vmul_f32(v163, v162); + float32x2_t v216 = v7[v215]; + float32x2_t v217 = vmul_f32(v212, v211); + float32x2_t v229 = v7[v228]; + float32x2_t v230 = vmul_f32(v225, v224); + float32x2_t v278 = v7[v277]; + float32x2_t v279 = vmul_f32(v274, v273); + float32x2_t v291 = v7[v290]; + float32x2_t v292 = vmul_f32(v287, v286); + float32x2_t v340 = v7[v339]; + float32x2_t v341 = vmul_f32(v336, v335); + float32x2_t v353 = v7[v352]; + float32x2_t v354 = vmul_f32(v349, v348); + float32x2_t v402 = v7[v401]; + float32x2_t v403 = vmul_f32(v398, v397); + float32x2_t v464 = v7[v463]; + float32x2_t v465 = vmul_f32(v460, v459); + float32x2_t v477 = v7[v476]; + float32x2_t v478 = vmul_f32(v473, v472); + float32x2_t v526 = v7[v525]; + float32x2_t v527 = vmul_f32(v522, v521); + float32x2_t v539 = v7[v538]; + float32x2_t v540 = vmul_f32(v535, v534); + float32x2_t v588 = v7[v587]; + float32x2_t v589 = vmul_f32(v584, v583); + float32x2_t v601 = v7[v600]; + float32x2_t v602 = vmul_f32(v597, v596); + float32x2_t v650 = v7[v649]; + float32x2_t v651 = vmul_f32(v646, v645); + float32x2_t v663 = v7[v662]; + float32x2_t v664 = vmul_f32(v659, v658); + float32x2_t v418 = vfma_f32(v416, v412, v415); + float32x2_t v46 = vfma_f32(v44, v40, v43); + float32x2_t v95 = vfma_f32(v93, v89, v92); + float32x2_t v108 = vfma_f32(v106, v102, v105); + float32x2_t v157 = vfma_f32(v155, v151, v154); + float32x2_t v170 = vfma_f32(v168, v164, v167); + float32x2_t v219 = vfma_f32(v217, v213, v216); + float32x2_t v232 = vfma_f32(v230, v226, v229); + float32x2_t v281 = vfma_f32(v279, v275, v278); + float32x2_t v294 = vfma_f32(v292, v288, v291); + float32x2_t v343 = vfma_f32(v341, v337, v340); + float32x2_t v356 = vfma_f32(v354, v350, v353); + float32x2_t v405 = vfma_f32(v403, v399, v402); + float32x2_t v467 = vfma_f32(v465, v461, v464); + float32x2_t v480 = vfma_f32(v478, v474, v477); + float32x2_t v529 = vfma_f32(v527, v523, v526); + float32x2_t v542 = vfma_f32(v540, v536, v539); + float32x2_t v591 = vfma_f32(v589, v585, v588); + float32x2_t v604 = vfma_f32(v602, v598, v601); + float32x2_t v653 = vfma_f32(v651, v647, v650); + float32x2_t v666 = 
vfma_f32(v664, v660, v663); + float32x2_t v672 = vadd_f32(v671, v46); + float32x2_t v673 = vsub_f32(v671, v46); + float32x2_t v674 = vadd_f32(v95, v108); + float32x2_t v675 = vsub_f32(v95, v108); + float32x2_t v676 = vadd_f32(v157, v170); + float32x2_t v677 = vsub_f32(v157, v170); + float32x2_t v678 = vadd_f32(v219, v232); + float32x2_t v679 = vsub_f32(v219, v232); + float32x2_t v680 = vadd_f32(v281, v294); + float32x2_t v681 = vsub_f32(v281, v294); + float32x2_t v682 = vadd_f32(v343, v356); + float32x2_t v683 = vsub_f32(v343, v356); + float32x2_t v684 = vadd_f32(v405, v418); + float32x2_t v685 = vsub_f32(v405, v418); + float32x2_t v686 = vadd_f32(v467, v480); + float32x2_t v687 = vsub_f32(v467, v480); + float32x2_t v688 = vadd_f32(v529, v542); + float32x2_t v689 = vsub_f32(v529, v542); + float32x2_t v690 = vadd_f32(v591, v604); + float32x2_t v691 = vsub_f32(v591, v604); + float32x2_t v692 = vadd_f32(v653, v666); + float32x2_t v693 = vsub_f32(v653, v666); + float32x2_t v694 = vadd_f32(v674, v692); + float32x2_t v695 = vadd_f32(v676, v690); + float32x2_t v696 = vadd_f32(v678, v688); + float32x2_t v697 = vadd_f32(v680, v686); + float32x2_t v698 = vadd_f32(v682, v684); + float32x2_t v699 = vsub_f32(v674, v692); + float32x2_t v700 = vsub_f32(v676, v690); + float32x2_t v701 = vsub_f32(v678, v688); + float32x2_t v702 = vsub_f32(v680, v686); + float32x2_t v703 = vsub_f32(v682, v684); + float32x2_t v892 = vadd_f32(v675, v693); + float32x2_t v893 = vadd_f32(v677, v691); + float32x2_t v894 = vadd_f32(v679, v689); + float32x2_t v895 = vadd_f32(v681, v687); + float32x2_t v896 = vadd_f32(v683, v685); + float32x2_t v897 = vsub_f32(v675, v693); + float32x2_t v898 = vsub_f32(v677, v691); + float32x2_t v899 = vsub_f32(v679, v689); + float32x2_t v900 = vsub_f32(v681, v687); + float32x2_t v901 = vsub_f32(v683, v685); + float32x2_t v704 = vadd_f32(v694, v695); + float32x2_t v705 = vadd_f32(v696, v698); + float32x2_t v707 = vsub_f32(v700, v701); + float32x2_t v708 = vadd_f32(v699, v703); + float32x2_t v713 = vsub_f32(v695, v697); + float32x2_t v714 = vsub_f32(v694, v697); + float32x2_t v715 = vsub_f32(v695, v694); + float32x2_t v716 = vsub_f32(v698, v697); + float32x2_t v717 = vsub_f32(v696, v697); + float32x2_t v718 = vsub_f32(v698, v696); + float32x2_t v719 = vsub_f32(v695, v698); + float32x2_t v720 = vsub_f32(v694, v696); + float32x2_t v722 = vadd_f32(v700, v702); + float32x2_t v723 = vsub_f32(v699, v702); + float32x2_t v724 = vadd_f32(v699, v700); + float32x2_t v725 = vsub_f32(v702, v703); + float32x2_t v726 = vsub_f32(v701, v702); + float32x2_t v727 = vsub_f32(v701, v703); + float32x2_t v728 = vadd_f32(v700, v703); + float32x2_t v729 = vsub_f32(v699, v701); + float32x2_t v902 = vadd_f32(v892, v893); + float32x2_t v903 = vadd_f32(v894, v896); + float32x2_t v905 = vsub_f32(v898, v899); + float32x2_t v906 = vadd_f32(v897, v901); + float32x2_t v911 = vsub_f32(v893, v895); + float32x2_t v912 = vsub_f32(v892, v895); + float32x2_t v913 = vsub_f32(v893, v892); + float32x2_t v914 = vsub_f32(v896, v895); + float32x2_t v915 = vsub_f32(v894, v895); + float32x2_t v916 = vsub_f32(v896, v894); + float32x2_t v917 = vsub_f32(v893, v896); + float32x2_t v918 = vsub_f32(v892, v894); + float32x2_t v920 = vadd_f32(v898, v900); + float32x2_t v921 = vsub_f32(v897, v900); + float32x2_t v922 = vadd_f32(v897, v898); + float32x2_t v923 = vsub_f32(v900, v901); + float32x2_t v924 = vsub_f32(v899, v900); + float32x2_t v925 = vsub_f32(v899, v901); + float32x2_t v926 = vadd_f32(v898, v901); + float32x2_t v927 = vsub_f32(v897, v899); + 
float32x2_t v706 = vadd_f32(v697, v704); + float32x2_t v711 = vsub_f32(v707, v708); + float32x2_t v721 = vsub_f32(v705, v704); + float32x2_t v730 = vadd_f32(v707, v708); + float32x2_t v749 = vmul_f32(v713, v946); + float32x2_t v753 = vmul_f32(v714, v950); + float32x2_t v757 = vmul_f32(v715, v954); + float32x2_t v761 = vmul_f32(v716, v958); + float32x2_t v765 = vmul_f32(v717, v962); + float32x2_t v769 = vmul_f32(v718, v966); + float32x2_t v773 = vmul_f32(v719, v970); + float32x2_t v777 = vmul_f32(v720, v974); + float32x2_t v787 = vrev64_f32(v722); + float32x2_t v794 = vrev64_f32(v723); + float32x2_t v801 = vrev64_f32(v724); + float32x2_t v808 = vrev64_f32(v725); + float32x2_t v815 = vrev64_f32(v726); + float32x2_t v822 = vrev64_f32(v727); + float32x2_t v829 = vrev64_f32(v728); + float32x2_t v836 = vrev64_f32(v729); + float32x2_t v904 = vadd_f32(v895, v902); + float32x2_t v909 = vsub_f32(v905, v906); + float32x2_t v919 = vsub_f32(v903, v902); + float32x2_t v928 = vadd_f32(v905, v906); + float32x2_t v947 = vmul_f32(v911, v946); + float32x2_t v951 = vmul_f32(v912, v950); + float32x2_t v955 = vmul_f32(v913, v954); + float32x2_t v959 = vmul_f32(v914, v958); + float32x2_t v963 = vmul_f32(v915, v962); + float32x2_t v967 = vmul_f32(v916, v966); + float32x2_t v971 = vmul_f32(v917, v970); + float32x2_t v975 = vmul_f32(v918, v974); + float32x2_t v985 = vrev64_f32(v920); + float32x2_t v992 = vrev64_f32(v921); + float32x2_t v999 = vrev64_f32(v922); + float32x2_t v1006 = vrev64_f32(v923); + float32x2_t v1013 = vrev64_f32(v924); + float32x2_t v1020 = vrev64_f32(v925); + float32x2_t v1027 = vrev64_f32(v926); + float32x2_t v1034 = vrev64_f32(v927); + float32x2_t v709 = vadd_f32(v706, v705); + float32x2_t v712 = vsub_f32(v711, v702); + float32x2_t v781 = vmul_f32(v721, v978); + float32x2_t v788 = vmul_f32(v787, v984); + float32x2_t v795 = vmul_f32(v794, v991); + float32x2_t v802 = vmul_f32(v801, v998); + float32x2_t v809 = vmul_f32(v808, v1005); + float32x2_t v816 = vmul_f32(v815, v1012); + float32x2_t v823 = vmul_f32(v822, v1019); + float32x2_t v830 = vmul_f32(v829, v1026); + float32x2_t v837 = vmul_f32(v836, v1033); + float32x2_t v843 = vrev64_f32(v730); + float32x2_t v846 = vadd_f32(v749, v753); + float32x2_t v847 = vadd_f32(v753, v757); + float32x2_t v848 = vsub_f32(v749, v757); + float32x2_t v849 = vadd_f32(v761, v765); + float32x2_t v850 = vadd_f32(v765, v769); + float32x2_t v851 = vsub_f32(v761, v769); + float32x2_t v907 = vadd_f32(v904, v903); + float32x2_t v910 = vsub_f32(v909, v900); + float32x2_t v979 = vmul_f32(v919, v978); + float32x2_t v986 = vmul_f32(v985, v984); + float32x2_t v993 = vmul_f32(v992, v991); + float32x2_t v1000 = vmul_f32(v999, v998); + float32x2_t v1007 = vmul_f32(v1006, v1005); + float32x2_t v1014 = vmul_f32(v1013, v1012); + float32x2_t v1021 = vmul_f32(v1020, v1019); + float32x2_t v1028 = vmul_f32(v1027, v1026); + float32x2_t v1035 = vmul_f32(v1034, v1033); + float32x2_t v1041 = vrev64_f32(v928); + float32x2_t v1044 = vadd_f32(v947, v951); + float32x2_t v1045 = vadd_f32(v951, v955); + float32x2_t v1046 = vsub_f32(v947, v955); + float32x2_t v1047 = vadd_f32(v959, v963); + float32x2_t v1048 = vadd_f32(v963, v967); + float32x2_t v1049 = vsub_f32(v959, v967); + float32x2_t v710 = vadd_f32(v672, v709); + float32x2_t v738 = vmul_f32(v709, v935); + float32x2_t v744 = vrev64_f32(v712); + float32x2_t v844 = vmul_f32(v843, v1040); + float32x2_t v852 = vadd_f32(v777, v781); + float32x2_t v853 = vadd_f32(v773, v781); + float32x2_t v854 = vadd_f32(v795, v802); + float32x2_t v855 = 
vsub_f32(v788, v802); + float32x2_t v856 = vadd_f32(v816, v823); + float32x2_t v857 = vsub_f32(v809, v823); + float32x2_t v908 = vadd_f32(v673, v907); + float32x2_t v936 = vmul_f32(v907, v935); + float32x2_t v942 = vrev64_f32(v910); + float32x2_t v1042 = vmul_f32(v1041, v1040); + float32x2_t v1050 = vadd_f32(v975, v979); + float32x2_t v1051 = vadd_f32(v971, v979); + float32x2_t v1052 = vadd_f32(v993, v1000); + float32x2_t v1053 = vsub_f32(v986, v1000); + float32x2_t v1054 = vadd_f32(v1014, v1021); + float32x2_t v1055 = vsub_f32(v1007, v1021); + float32x2_t v745 = vmul_f32(v744, v941); + float32x2_t v845 = vsub_f32(v710, v738); + float32x2_t v858 = vadd_f32(v837, v844); + float32x2_t v859 = vsub_f32(v830, v844); + float32x2_t v860 = vadd_f32(v850, v852); + float32x2_t v878 = vadd_f32(v854, v855); + float32x2_t v943 = vmul_f32(v942, v941); + float32x2_t v1043 = vsub_f32(v908, v936); + float32x2_t v1056 = vadd_f32(v1035, v1042); + float32x2_t v1057 = vsub_f32(v1028, v1042); + float32x2_t v1058 = vadd_f32(v1048, v1050); + float32x2_t v1076 = vadd_f32(v1052, v1053); + int16x4_t v1092 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v710, 15), (int32x2_t){0, 0})); + int16x4_t v1098 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v908, 15), (int32x2_t){0, 0})); + float32x2_t v861 = vadd_f32(v860, v845); + float32x2_t v862 = vsub_f32(v845, v847); + float32x2_t v864 = vadd_f32(v845, v851); + float32x2_t v866 = vsub_f32(v845, v848); + float32x2_t v868 = vadd_f32(v845, v846); + float32x2_t v870 = vadd_f32(v745, v856); + float32x2_t v872 = vsub_f32(v858, v854); + float32x2_t v874 = vadd_f32(v745, v859); + float32x2_t v876 = vsub_f32(v859, v855); + float32x2_t v879 = vadd_f32(v878, v856); + float32x2_t v1059 = vadd_f32(v1058, v1043); + float32x2_t v1060 = vsub_f32(v1043, v1045); + float32x2_t v1062 = vadd_f32(v1043, v1049); + float32x2_t v1064 = vsub_f32(v1043, v1046); + float32x2_t v1066 = vadd_f32(v1043, v1044); + float32x2_t v1068 = vadd_f32(v943, v1054); + float32x2_t v1070 = vsub_f32(v1056, v1052); + float32x2_t v1072 = vadd_f32(v943, v1057); + float32x2_t v1074 = vsub_f32(v1057, v1053); + float32x2_t v1077 = vadd_f32(v1076, v1054); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v1092), 0); + v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v1098), 0); + float32x2_t v863 = vsub_f32(v862, v852); + float32x2_t v865 = vadd_f32(v864, v853); + float32x2_t v867 = vsub_f32(v866, v853); + float32x2_t v869 = vsub_f32(v868, v849); + float32x2_t v871 = vadd_f32(v870, v858); + float32x2_t v873 = vsub_f32(v872, v745); + float32x2_t v875 = vadd_f32(v874, v857); + float32x2_t v877 = vsub_f32(v876, v745); + float32x2_t v880 = vadd_f32(v879, v857); + float32x2_t v1061 = vsub_f32(v1060, v1050); + float32x2_t v1063 = vadd_f32(v1062, v1051); + float32x2_t v1065 = vsub_f32(v1064, v1051); + float32x2_t v1067 = vsub_f32(v1066, v1047); + float32x2_t v1069 = vadd_f32(v1068, v1056); + float32x2_t v1071 = vsub_f32(v1070, v943); + float32x2_t v1073 = vadd_f32(v1072, v1055); + float32x2_t v1075 = vsub_f32(v1074, v943); + float32x2_t v1078 = vadd_f32(v1077, v1055); + float32x2_t v881 = vsub_f32(v880, v745); + float32x2_t v883 = vadd_f32(v861, v871); + float32x2_t v884 = vadd_f32(v863, v873); + float32x2_t v885 = vsub_f32(v865, v875); + float32x2_t v886 = vadd_f32(v867, v877); + float32x2_t v887 = vsub_f32(v867, v877); + float32x2_t v888 = vadd_f32(v865, v875); + float32x2_t v889 = vsub_f32(v863, v873); + float32x2_t v890 = vsub_f32(v861, v871); + float32x2_t v1079 = vsub_f32(v1078, v943); + float32x2_t v1081 = vadd_f32(v1059, v1069); 
+ float32x2_t v1082 = vadd_f32(v1061, v1071); + float32x2_t v1083 = vsub_f32(v1063, v1073); + float32x2_t v1084 = vadd_f32(v1065, v1075); + float32x2_t v1085 = vsub_f32(v1065, v1075); + float32x2_t v1086 = vadd_f32(v1063, v1073); + float32x2_t v1087 = vsub_f32(v1061, v1071); + float32x2_t v1088 = vsub_f32(v1059, v1069); + float32x2_t v882 = vadd_f32(v869, v881); + float32x2_t v891 = vsub_f32(v869, v881); + float32x2_t v1080 = vadd_f32(v1067, v1079); + float32x2_t v1089 = vsub_f32(v1067, v1079); + int16x4_t v1116 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v890, 15), (int32x2_t){0, 0})); + int16x4_t v1122 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1088, 15), (int32x2_t){0, 0})); + int16x4_t v1128 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v889, 15), (int32x2_t){0, 0})); + int16x4_t v1134 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1087, 15), (int32x2_t){0, 0})); + int16x4_t v1140 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v888, 15), (int32x2_t){0, 0})); + int16x4_t v1146 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1086, 15), (int32x2_t){0, 0})); + int16x4_t v1152 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v887, 15), (int32x2_t){0, 0})); + int16x4_t v1158 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1085, 15), (int32x2_t){0, 0})); + int16x4_t v1164 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v886, 15), (int32x2_t){0, 0})); + int16x4_t v1170 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1084, 15), (int32x2_t){0, 0})); + int16x4_t v1176 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v885, 15), (int32x2_t){0, 0})); + int16x4_t v1182 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1083, 15), (int32x2_t){0, 0})); + int16x4_t v1188 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v884, 15), (int32x2_t){0, 0})); + int16x4_t v1194 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1082, 15), (int32x2_t){0, 0})); + int16x4_t v1200 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v883, 15), (int32x2_t){0, 0})); + int16x4_t v1206 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1081, 15), (int32x2_t){0, 0})); + int16x4_t v1104 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v891, 15), (int32x2_t){0, 0})); + int16x4_t v1110 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1089, 15), (int32x2_t){0, 0})); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v1116), 0); + v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v1122), 0); + v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v1128), 0); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v1134), 0); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v1140), 0); + v6[ostride * 15] = vget_lane_s32(vreinterpret_s32_s16(v1146), 0); + v6[ostride * 16] = vget_lane_s32(vreinterpret_s32_s16(v1152), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v1158), 0); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v1164), 0); + v6[ostride * 17] = vget_lane_s32(vreinterpret_s32_s16(v1170), 0); + v6[ostride * 18] = vget_lane_s32(vreinterpret_s32_s16(v1176), 0); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v1182), 0); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v1188), 0); + v6[ostride * 19] = vget_lane_s32(vreinterpret_s32_s16(v1194), 0); + v6[ostride * 20] = vget_lane_s32(vreinterpret_s32_s16(v1200), 0); + v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v1206), 0); + int16x4_t v1212 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v882, 15), (int32x2_t){0, 0})); + int16x4_t v1218 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1080, 15), (int32x2_t){0, 0})); + v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v1104), 0); + v6[ostride] = 
vget_lane_s32(vreinterpret_s32_s16(v1110), 0); + v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v1212), 0); + v6[ostride * 21] = vget_lane_s32(vreinterpret_s32_s16(v1218), 0); + v5 += 1 * idist; + v6 += 1 * odist; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cs16_ab_t_gs22(const armral_cmplx_f32_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, + const armral_cmplx_f32_t *restrict w, + int howmany, int idist, int odist, + float dir) { + int64_t v0 = istride; + int64_t v1 = idist; + int64_t v2 = ostride; + int64_t v3 = odist; + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + int32_t *v6 = (int32_t *)y; + const float32x2_t *v7 = (const float32x2_t *)w; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * v1; + int64_t v12 = v10 * v3; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + int64_t v13 = j; + float v740 = 1.1000000000000001e+00F; + float v745 = -3.3166247903554003e-01F; + float v752 = 5.1541501300188641e-01F; + float v757 = 9.4125353283118118e-01F; + float v762 = 1.4143537075597825e+00F; + float v767 = 8.5949297361449750e-01F; + float v772 = 4.2314838273285138e-02F; + float v777 = 3.8639279888589606e-01F; + float v782 = 5.1254589567200015e-01F; + float v787 = 1.0702757469471715e+00F; + float v792 = 5.5486073394528512e-01F; + float v797 = -1.2412944743900585e+00F; + float v804 = -2.0897833842005756e-01F; + float v811 = -3.7415717312460811e-01F; + float v818 = -4.9929922194110327e-02F; + float v825 = -6.5815896284539266e-01F; + float v832 = -6.3306543373877577e-01F; + float v839 = -1.0822460581641109e+00F; + float v846 = -8.1720737907134022e-01F; + float v853 = -4.2408709531871824e-01F; + const float32x2_t *v1196 = &v5[v0]; + int32_t *v1359 = &v6[v2]; + int64_t v19 = v0 * 11; + int64_t v34 = v10 * 10; + int64_t v40 = v0 * 2; + int64_t v54 = v0 * 13; + int64_t v76 = v10 * 12; + int64_t v82 = v0 * 4; + int64_t v96 = v0 * 15; + int64_t v111 = v10 * 3; + int64_t v118 = v10 * 14; + int64_t v124 = v0 * 6; + int64_t v138 = v0 * 17; + int64_t v153 = v10 * 5; + int64_t v160 = v10 * 16; + int64_t v166 = v0 * 8; + int64_t v180 = v0 * 19; + int64_t v195 = v10 * 7; + int64_t v202 = v10 * 18; + int64_t v208 = v0 * 10; + int64_t v222 = v0 * 21; + int64_t v237 = v10 * 9; + int64_t v244 = v10 * 20; + int64_t v250 = v0 * 12; + int64_t v279 = v10 * 11; + int64_t v292 = v0 * 14; + int64_t v306 = v0 * 3; + int64_t v321 = v10 * 13; + int64_t v328 = v10 * 2; + int64_t v334 = v0 * 16; + int64_t v348 = v0 * 5; + int64_t v363 = v10 * 15; + int64_t v370 = v10 * 4; + int64_t v376 = v0 * 18; + int64_t v390 = v0 * 7; + int64_t v405 = v10 * 17; + int64_t v412 = v10 * 6; + int64_t v418 = v0 * 20; + int64_t v432 = v0 * 9; + int64_t v447 = v10 * 19; + int64_t v454 = v10 * 8; + int64_t v455 = v13 * 21; + float v748 = v4 * v745; + float v800 = v4 * v797; + float v807 = v4 * v804; + float v814 = v4 * v811; + float v821 = v4 * v818; + float v828 = v4 * v825; + float v835 = v4 * v832; + float v842 = v4 * v839; + float v849 = v4 * v846; + float v856 = v4 * v853; + int64_t v915 = v2 * 11; + int64_t v923 = v2 * 12; + int64_t v939 = v2 * 2; + int64_t v947 = v2 * 13; + int64_t v955 = v2 * 14; + int64_t v963 = v2 * 3; + int64_t v971 = v2 * 4; + int64_t v979 = v2 * 15; + int64_t v987 = v2 * 16; + int64_t v995 = v2 * 5; + int64_t v1003 = v2 * 6; + int64_t v1011 = v2 * 17; + int64_t v1019 = v2 * 18; + int64_t v1027 = v2 * 7; + int64_t v1035 = v2 * 8; + int64_t v1043 = v2 * 19; + int64_t 
v1051 = v2 * 20; + int64_t v1059 = v2 * 9; + int64_t v1067 = v2 * 10; + int64_t v1075 = v2 * 21; + const float32x2_t *v1280 = &v5[0]; + svint64_t v1281 = svindex_s64(0, v1); + svfloat32_t v1305 = svdup_n_f32(v740); + svfloat32_t v1307 = svdup_n_f32(v752); + svfloat32_t v1308 = svdup_n_f32(v757); + svfloat32_t v1309 = svdup_n_f32(v762); + svfloat32_t v1310 = svdup_n_f32(v767); + svfloat32_t v1311 = svdup_n_f32(v772); + svfloat32_t v1312 = svdup_n_f32(v777); + svfloat32_t v1313 = svdup_n_f32(v782); + svfloat32_t v1314 = svdup_n_f32(v787); + svfloat32_t v1315 = svdup_n_f32(v792); + int32_t *v1332 = &v6[0]; + svint64_t v1522 = svindex_s64(0, v3); + int64_t v36 = v34 + v455; + int64_t v71 = v10 + v455; + int64_t v78 = v76 + v455; + int64_t v113 = v111 + v455; + int64_t v120 = v118 + v455; + int64_t v155 = v153 + v455; + int64_t v162 = v160 + v455; + int64_t v197 = v195 + v455; + int64_t v204 = v202 + v455; + int64_t v239 = v237 + v455; + int64_t v246 = v244 + v455; + int64_t v281 = v279 + v455; + svfloat32_t v289 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v455])); + int64_t v323 = v321 + v455; + int64_t v330 = v328 + v455; + int64_t v365 = v363 + v455; + int64_t v372 = v370 + v455; + int64_t v407 = v405 + v455; + int64_t v414 = v412 + v455; + int64_t v449 = v447 + v455; + int64_t v456 = v454 + v455; + const float32x2_t *v1088 = &v5[v19]; + const float32x2_t *v1097 = &v5[v40]; + const float32x2_t *v1106 = &v5[v54]; + const float32x2_t *v1115 = &v5[v82]; + const float32x2_t *v1124 = &v5[v96]; + const float32x2_t *v1133 = &v5[v124]; + const float32x2_t *v1142 = &v5[v138]; + const float32x2_t *v1151 = &v5[v166]; + const float32x2_t *v1160 = &v5[v180]; + const float32x2_t *v1169 = &v5[v208]; + const float32x2_t *v1178 = &v5[v222]; + const float32x2_t *v1187 = &v5[v250]; + svfloat32_t v1198 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1196), v1281)); + const float32x2_t *v1207 = &v5[v292]; + const float32x2_t *v1216 = &v5[v306]; + const float32x2_t *v1225 = &v5[v334]; + const float32x2_t *v1234 = &v5[v348]; + const float32x2_t *v1243 = &v5[v376]; + const float32x2_t *v1252 = &v5[v390]; + const float32x2_t *v1261 = &v5[v418]; + const float32x2_t *v1270 = &v5[v432]; + svfloat32_t v1282 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1280), v1281)); + svfloat32_t v1306 = svdup_n_f32(v748); + svfloat32_t v1316 = svdup_n_f32(v800); + svfloat32_t v1317 = svdup_n_f32(v807); + svfloat32_t v1318 = svdup_n_f32(v814); + svfloat32_t v1319 = svdup_n_f32(v821); + svfloat32_t v1320 = svdup_n_f32(v828); + svfloat32_t v1321 = svdup_n_f32(v835); + svfloat32_t v1322 = svdup_n_f32(v842); + svfloat32_t v1323 = svdup_n_f32(v849); + svfloat32_t v1324 = svdup_n_f32(v856); + int32_t *v1341 = &v6[v915]; + int32_t *v1350 = &v6[v923]; + int32_t *v1368 = &v6[v939]; + int32_t *v1377 = &v6[v947]; + int32_t *v1386 = &v6[v955]; + int32_t *v1395 = &v6[v963]; + int32_t *v1404 = &v6[v971]; + int32_t *v1413 = &v6[v979]; + int32_t *v1422 = &v6[v987]; + int32_t *v1431 = &v6[v995]; + int32_t *v1440 = &v6[v1003]; + int32_t *v1449 = &v6[v1011]; + int32_t *v1458 = &v6[v1019]; + int32_t *v1467 = &v6[v1027]; + int32_t *v1476 = &v6[v1035]; + int32_t *v1485 = &v6[v1043]; + int32_t *v1494 = &v6[v1051]; + int32_t *v1503 = &v6[v1059]; + int32_t *v1512 = &v6[v1067]; + int32_t *v1521 = &v6[v1075]; + svfloat32_t v37 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v36])); + svfloat32_t v72 = + svreinterpret_f32_f64(svld1_f64(pred_full, 
&((const double *)v7)[v71])); + svfloat32_t v79 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v78])); + svfloat32_t v114 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v113])); + svfloat32_t v121 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v120])); + svfloat32_t v156 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v155])); + svfloat32_t v163 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v162])); + svfloat32_t v198 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v197])); + svfloat32_t v205 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v204])); + svfloat32_t v240 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v239])); + svfloat32_t v247 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v246])); + svfloat32_t v282 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v281])); + svfloat32_t zero290 = svdup_n_f32(0); + svfloat32_t v290 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero290, v1198, v289, 0), v1198, + v289, 90); + svfloat32_t v324 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v323])); + svfloat32_t v331 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v330])); + svfloat32_t v366 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v365])); + svfloat32_t v373 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v372])); + svfloat32_t v408 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v407])); + svfloat32_t v415 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v414])); + svfloat32_t v450 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v449])); + svfloat32_t v457 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v456])); + svfloat32_t v1090 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1088), v1281)); + svfloat32_t v1099 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1097), v1281)); + svfloat32_t v1108 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1106), v1281)); + svfloat32_t v1117 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1115), v1281)); + svfloat32_t v1126 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1124), v1281)); + svfloat32_t v1135 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1133), v1281)); + svfloat32_t v1144 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1142), v1281)); + svfloat32_t v1153 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1151), v1281)); + svfloat32_t v1162 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1160), v1281)); + svfloat32_t v1171 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1169), v1281)); + svfloat32_t v1180 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1178), v1281)); + svfloat32_t v1189 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1187), v1281)); + svfloat32_t v1209 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1207), v1281)); + svfloat32_t v1218 = svreinterpret_f32_f64( + 
svld1_gather_s64index_f64(pred_full, (const double *)(v1216), v1281)); + svfloat32_t v1227 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1225), v1281)); + svfloat32_t v1236 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1234), v1281)); + svfloat32_t v1245 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1243), v1281)); + svfloat32_t v1254 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1252), v1281)); + svfloat32_t v1263 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1261), v1281)); + svfloat32_t v1272 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1270), v1281)); + svfloat32_t zero38 = svdup_n_f32(0); + svfloat32_t v38 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero38, v1090, v37, 0), + v1090, v37, 90); + svfloat32_t zero73 = svdup_n_f32(0); + svfloat32_t v73 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero73, v1099, v72, 0), + v1099, v72, 90); + svfloat32_t zero80 = svdup_n_f32(0); + svfloat32_t v80 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero80, v1108, v79, 0), + v1108, v79, 90); + svfloat32_t zero115 = svdup_n_f32(0); + svfloat32_t v115 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero115, v1117, v114, 0), v1117, + v114, 90); + svfloat32_t zero122 = svdup_n_f32(0); + svfloat32_t v122 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero122, v1126, v121, 0), v1126, + v121, 90); + svfloat32_t zero157 = svdup_n_f32(0); + svfloat32_t v157 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero157, v1135, v156, 0), v1135, + v156, 90); + svfloat32_t zero164 = svdup_n_f32(0); + svfloat32_t v164 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero164, v1144, v163, 0), v1144, + v163, 90); + svfloat32_t zero199 = svdup_n_f32(0); + svfloat32_t v199 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero199, v1153, v198, 0), v1153, + v198, 90); + svfloat32_t zero206 = svdup_n_f32(0); + svfloat32_t v206 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero206, v1162, v205, 0), v1162, + v205, 90); + svfloat32_t zero241 = svdup_n_f32(0); + svfloat32_t v241 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero241, v1171, v240, 0), v1171, + v240, 90); + svfloat32_t zero248 = svdup_n_f32(0); + svfloat32_t v248 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero248, v1180, v247, 0), v1180, + v247, 90); + svfloat32_t zero283 = svdup_n_f32(0); + svfloat32_t v283 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero283, v1189, v282, 0), v1189, + v282, 90); + svfloat32_t zero325 = svdup_n_f32(0); + svfloat32_t v325 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero325, v1209, v324, 0), v1209, + v324, 90); + svfloat32_t zero332 = svdup_n_f32(0); + svfloat32_t v332 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero332, v1218, v331, 0), v1218, + v331, 90); + svfloat32_t zero367 = svdup_n_f32(0); + svfloat32_t v367 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero367, v1227, v366, 0), v1227, + v366, 90); + svfloat32_t zero374 = svdup_n_f32(0); + svfloat32_t v374 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero374, v1236, v373, 0), v1236, + v373, 90); + svfloat32_t zero409 = svdup_n_f32(0); + svfloat32_t v409 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero409, v1245, v408, 0), v1245, + v408, 90); + svfloat32_t zero416 = svdup_n_f32(0); + svfloat32_t v416 = svcmla_f32_x( + pred_full, 
svcmla_f32_x(pred_full, zero416, v1254, v415, 0), v1254, + v415, 90); + svfloat32_t zero451 = svdup_n_f32(0); + svfloat32_t v451 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero451, v1263, v450, 0), v1263, + v450, 90); + svfloat32_t zero458 = svdup_n_f32(0); + svfloat32_t v458 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero458, v1272, v457, 0), v1272, + v457, 90); + svfloat32_t v466 = svadd_f32_x(svptrue_b32(), v1282, v38); + svfloat32_t v467 = svsub_f32_x(svptrue_b32(), v1282, v38); + svfloat32_t v468 = svadd_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v469 = svsub_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v470 = svadd_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v471 = svsub_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v472 = svadd_f32_x(svptrue_b32(), v157, v164); + svfloat32_t v473 = svsub_f32_x(svptrue_b32(), v157, v164); + svfloat32_t v474 = svadd_f32_x(svptrue_b32(), v199, v206); + svfloat32_t v475 = svsub_f32_x(svptrue_b32(), v199, v206); + svfloat32_t v476 = svadd_f32_x(svptrue_b32(), v241, v248); + svfloat32_t v477 = svsub_f32_x(svptrue_b32(), v241, v248); + svfloat32_t v478 = svadd_f32_x(svptrue_b32(), v283, v290); + svfloat32_t v479 = svsub_f32_x(svptrue_b32(), v283, v290); + svfloat32_t v480 = svadd_f32_x(svptrue_b32(), v325, v332); + svfloat32_t v481 = svsub_f32_x(svptrue_b32(), v325, v332); + svfloat32_t v482 = svadd_f32_x(svptrue_b32(), v367, v374); + svfloat32_t v483 = svsub_f32_x(svptrue_b32(), v367, v374); + svfloat32_t v484 = svadd_f32_x(svptrue_b32(), v409, v416); + svfloat32_t v485 = svsub_f32_x(svptrue_b32(), v409, v416); + svfloat32_t v486 = svadd_f32_x(svptrue_b32(), v451, v458); + svfloat32_t v487 = svsub_f32_x(svptrue_b32(), v451, v458); + svfloat32_t v488 = svadd_f32_x(svptrue_b32(), v468, v486); + svfloat32_t v489 = svadd_f32_x(svptrue_b32(), v470, v484); + svfloat32_t v490 = svadd_f32_x(svptrue_b32(), v472, v482); + svfloat32_t v491 = svadd_f32_x(svptrue_b32(), v474, v480); + svfloat32_t v492 = svadd_f32_x(svptrue_b32(), v476, v478); + svfloat32_t v493 = svsub_f32_x(svptrue_b32(), v468, v486); + svfloat32_t v494 = svsub_f32_x(svptrue_b32(), v470, v484); + svfloat32_t v495 = svsub_f32_x(svptrue_b32(), v472, v482); + svfloat32_t v496 = svsub_f32_x(svptrue_b32(), v474, v480); + svfloat32_t v497 = svsub_f32_x(svptrue_b32(), v476, v478); + svfloat32_t v697 = svadd_f32_x(svptrue_b32(), v469, v487); + svfloat32_t v698 = svadd_f32_x(svptrue_b32(), v471, v485); + svfloat32_t v699 = svadd_f32_x(svptrue_b32(), v473, v483); + svfloat32_t v700 = svadd_f32_x(svptrue_b32(), v475, v481); + svfloat32_t v701 = svadd_f32_x(svptrue_b32(), v477, v479); + svfloat32_t v702 = svsub_f32_x(svptrue_b32(), v469, v487); + svfloat32_t v703 = svsub_f32_x(svptrue_b32(), v471, v485); + svfloat32_t v704 = svsub_f32_x(svptrue_b32(), v473, v483); + svfloat32_t v705 = svsub_f32_x(svptrue_b32(), v475, v481); + svfloat32_t v706 = svsub_f32_x(svptrue_b32(), v477, v479); + svfloat32_t v498 = svadd_f32_x(svptrue_b32(), v488, v489); + svfloat32_t v499 = svadd_f32_x(svptrue_b32(), v490, v492); + svfloat32_t v501 = svsub_f32_x(svptrue_b32(), v494, v495); + svfloat32_t v502 = svadd_f32_x(svptrue_b32(), v493, v497); + svfloat32_t v507 = svsub_f32_x(svptrue_b32(), v489, v491); + svfloat32_t v508 = svsub_f32_x(svptrue_b32(), v488, v491); + svfloat32_t v509 = svsub_f32_x(svptrue_b32(), v489, v488); + svfloat32_t v510 = svsub_f32_x(svptrue_b32(), v492, v491); + svfloat32_t v511 = svsub_f32_x(svptrue_b32(), v490, v491); + svfloat32_t v512 = svsub_f32_x(svptrue_b32(), v492, v490); 
+ svfloat32_t v513 = svsub_f32_x(svptrue_b32(), v489, v492); + svfloat32_t v514 = svsub_f32_x(svptrue_b32(), v488, v490); + svfloat32_t v516 = svadd_f32_x(svptrue_b32(), v494, v496); + svfloat32_t v517 = svsub_f32_x(svptrue_b32(), v493, v496); + svfloat32_t v518 = svadd_f32_x(svptrue_b32(), v493, v494); + svfloat32_t v519 = svsub_f32_x(svptrue_b32(), v496, v497); + svfloat32_t v520 = svsub_f32_x(svptrue_b32(), v495, v496); + svfloat32_t v521 = svsub_f32_x(svptrue_b32(), v495, v497); + svfloat32_t v522 = svadd_f32_x(svptrue_b32(), v494, v497); + svfloat32_t v523 = svsub_f32_x(svptrue_b32(), v493, v495); + svfloat32_t v707 = svadd_f32_x(svptrue_b32(), v697, v698); + svfloat32_t v708 = svadd_f32_x(svptrue_b32(), v699, v701); + svfloat32_t v710 = svsub_f32_x(svptrue_b32(), v703, v704); + svfloat32_t v711 = svadd_f32_x(svptrue_b32(), v702, v706); + svfloat32_t v716 = svsub_f32_x(svptrue_b32(), v698, v700); + svfloat32_t v717 = svsub_f32_x(svptrue_b32(), v697, v700); + svfloat32_t v718 = svsub_f32_x(svptrue_b32(), v698, v697); + svfloat32_t v719 = svsub_f32_x(svptrue_b32(), v701, v700); + svfloat32_t v720 = svsub_f32_x(svptrue_b32(), v699, v700); + svfloat32_t v721 = svsub_f32_x(svptrue_b32(), v701, v699); + svfloat32_t v722 = svsub_f32_x(svptrue_b32(), v698, v701); + svfloat32_t v723 = svsub_f32_x(svptrue_b32(), v697, v699); + svfloat32_t v725 = svadd_f32_x(svptrue_b32(), v703, v705); + svfloat32_t v726 = svsub_f32_x(svptrue_b32(), v702, v705); + svfloat32_t v727 = svadd_f32_x(svptrue_b32(), v702, v703); + svfloat32_t v728 = svsub_f32_x(svptrue_b32(), v705, v706); + svfloat32_t v729 = svsub_f32_x(svptrue_b32(), v704, v705); + svfloat32_t v730 = svsub_f32_x(svptrue_b32(), v704, v706); + svfloat32_t v731 = svadd_f32_x(svptrue_b32(), v703, v706); + svfloat32_t v732 = svsub_f32_x(svptrue_b32(), v702, v704); + svfloat32_t v500 = svadd_f32_x(svptrue_b32(), v491, v498); + svfloat32_t v505 = svsub_f32_x(svptrue_b32(), v501, v502); + svfloat32_t v515 = svsub_f32_x(svptrue_b32(), v499, v498); + svfloat32_t v524 = svadd_f32_x(svptrue_b32(), v501, v502); + svfloat32_t v551 = svmul_f32_x(svptrue_b32(), v508, v1308); + svfloat32_t v556 = svmul_f32_x(svptrue_b32(), v509, v1309); + svfloat32_t v566 = svmul_f32_x(svptrue_b32(), v511, v1311); + svfloat32_t v571 = svmul_f32_x(svptrue_b32(), v512, v1312); + svfloat32_t zero593 = svdup_n_f32(0); + svfloat32_t v593 = svcmla_f32_x(pred_full, zero593, v1316, v516, 90); + svfloat32_t zero607 = svdup_n_f32(0); + svfloat32_t v607 = svcmla_f32_x(pred_full, zero607, v1318, v518, 90); + svfloat32_t zero614 = svdup_n_f32(0); + svfloat32_t v614 = svcmla_f32_x(pred_full, zero614, v1319, v519, 90); + svfloat32_t zero628 = svdup_n_f32(0); + svfloat32_t v628 = svcmla_f32_x(pred_full, zero628, v1321, v521, 90); + svfloat32_t zero635 = svdup_n_f32(0); + svfloat32_t v635 = svcmla_f32_x(pred_full, zero635, v1322, v522, 90); + svfloat32_t v709 = svadd_f32_x(svptrue_b32(), v700, v707); + svfloat32_t v714 = svsub_f32_x(svptrue_b32(), v710, v711); + svfloat32_t v724 = svsub_f32_x(svptrue_b32(), v708, v707); + svfloat32_t v733 = svadd_f32_x(svptrue_b32(), v710, v711); + svfloat32_t v760 = svmul_f32_x(svptrue_b32(), v717, v1308); + svfloat32_t v765 = svmul_f32_x(svptrue_b32(), v718, v1309); + svfloat32_t v775 = svmul_f32_x(svptrue_b32(), v720, v1311); + svfloat32_t v780 = svmul_f32_x(svptrue_b32(), v721, v1312); + svfloat32_t zero802 = svdup_n_f32(0); + svfloat32_t v802 = svcmla_f32_x(pred_full, zero802, v1316, v725, 90); + svfloat32_t zero816 = svdup_n_f32(0); + svfloat32_t v816 = 
svcmla_f32_x(pred_full, zero816, v1318, v727, 90); + svfloat32_t zero823 = svdup_n_f32(0); + svfloat32_t v823 = svcmla_f32_x(pred_full, zero823, v1319, v728, 90); + svfloat32_t zero837 = svdup_n_f32(0); + svfloat32_t v837 = svcmla_f32_x(pred_full, zero837, v1321, v730, 90); + svfloat32_t zero844 = svdup_n_f32(0); + svfloat32_t v844 = svcmla_f32_x(pred_full, zero844, v1322, v731, 90); + svfloat32_t v503 = svadd_f32_x(svptrue_b32(), v500, v499); + svfloat32_t v506 = svsub_f32_x(svptrue_b32(), v505, v496); + svfloat32_t v586 = svmul_f32_x(svptrue_b32(), v515, v1315); + svfloat32_t zero649 = svdup_n_f32(0); + svfloat32_t v649 = svcmla_f32_x(pred_full, zero649, v1324, v524, 90); + svfloat32_t v651 = svmla_f32_x(pred_full, v551, v507, v1307); + svfloat32_t v652 = svmla_f32_x(pred_full, v556, v508, v1308); + svfloat32_t v653 = svnmls_f32_x(pred_full, v556, v507, v1307); + svfloat32_t v654 = svmla_f32_x(pred_full, v566, v510, v1310); + svfloat32_t v655 = svmla_f32_x(pred_full, v571, v511, v1311); + svfloat32_t v656 = svnmls_f32_x(pred_full, v571, v510, v1310); + svfloat32_t v659 = svcmla_f32_x(pred_full, v607, v1317, v517, 90); + svfloat32_t v660 = svsub_f32_x(svptrue_b32(), v593, v607); + svfloat32_t v661 = svcmla_f32_x(pred_full, v628, v1320, v520, 90); + svfloat32_t v662 = svsub_f32_x(svptrue_b32(), v614, v628); + svfloat32_t v712 = svadd_f32_x(svptrue_b32(), v709, v708); + svfloat32_t v715 = svsub_f32_x(svptrue_b32(), v714, v705); + svfloat32_t v795 = svmul_f32_x(svptrue_b32(), v724, v1315); + svfloat32_t zero858 = svdup_n_f32(0); + svfloat32_t v858 = svcmla_f32_x(pred_full, zero858, v1324, v733, 90); + svfloat32_t v860 = svmla_f32_x(pred_full, v760, v716, v1307); + svfloat32_t v861 = svmla_f32_x(pred_full, v765, v717, v1308); + svfloat32_t v862 = svnmls_f32_x(pred_full, v765, v716, v1307); + svfloat32_t v863 = svmla_f32_x(pred_full, v775, v719, v1310); + svfloat32_t v864 = svmla_f32_x(pred_full, v780, v720, v1311); + svfloat32_t v865 = svnmls_f32_x(pred_full, v780, v719, v1310); + svfloat32_t v868 = svcmla_f32_x(pred_full, v816, v1317, v726, 90); + svfloat32_t v869 = svsub_f32_x(svptrue_b32(), v802, v816); + svfloat32_t v870 = svcmla_f32_x(pred_full, v837, v1320, v729, 90); + svfloat32_t v871 = svsub_f32_x(svptrue_b32(), v823, v837); + svfloat32_t v504 = svadd_f32_x(svptrue_b32(), v466, v503); + svfloat32_t zero541 = svdup_n_f32(0); + svfloat32_t v541 = svcmla_f32_x(pred_full, zero541, v1306, v506, 90); + svfloat32_t v657 = svmla_f32_x(pred_full, v586, v514, v1314); + svfloat32_t v658 = svmla_f32_x(pred_full, v586, v513, v1313); + svfloat32_t v663 = svcmla_f32_x(pred_full, v649, v1323, v523, 90); + svfloat32_t v664 = svsub_f32_x(svptrue_b32(), v635, v649); + svfloat32_t v683 = svadd_f32_x(svptrue_b32(), v659, v660); + svfloat32_t v713 = svadd_f32_x(svptrue_b32(), v467, v712); + svfloat32_t zero750 = svdup_n_f32(0); + svfloat32_t v750 = svcmla_f32_x(pred_full, zero750, v1306, v715, 90); + svfloat32_t v866 = svmla_f32_x(pred_full, v795, v723, v1314); + svfloat32_t v867 = svmla_f32_x(pred_full, v795, v722, v1313); + svfloat32_t v872 = svcmla_f32_x(pred_full, v858, v1323, v732, 90); + svfloat32_t v873 = svsub_f32_x(svptrue_b32(), v844, v858); + svfloat32_t v892 = svadd_f32_x(svptrue_b32(), v868, v869); + svfloat32_t v650 = svmls_f32_x(pred_full, v504, v503, v1305); + svfloat32_t v665 = svadd_f32_x(svptrue_b32(), v655, v657); + svfloat32_t v675 = svadd_f32_x(svptrue_b32(), v541, v661); + svfloat32_t v677 = svsub_f32_x(svptrue_b32(), v663, v659); + svfloat32_t v679 = svadd_f32_x(svptrue_b32(), 
v541, v664); + svfloat32_t v681 = svsub_f32_x(svptrue_b32(), v664, v660); + svfloat32_t v684 = svadd_f32_x(svptrue_b32(), v683, v661); + svfloat32_t v859 = svmls_f32_x(pred_full, v713, v712, v1305); + svfloat32_t v874 = svadd_f32_x(svptrue_b32(), v864, v866); + svfloat32_t v884 = svadd_f32_x(svptrue_b32(), v750, v870); + svfloat32_t v886 = svsub_f32_x(svptrue_b32(), v872, v868); + svfloat32_t v888 = svadd_f32_x(svptrue_b32(), v750, v873); + svfloat32_t v890 = svsub_f32_x(svptrue_b32(), v873, v869); + svfloat32_t v893 = svadd_f32_x(svptrue_b32(), v892, v870); + svint16_t v908 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v504, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v916 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v713, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v666 = svadd_f32_x(svptrue_b32(), v665, v650); + svfloat32_t v667 = svsub_f32_x(svptrue_b32(), v650, v652); + svfloat32_t v669 = svadd_f32_x(svptrue_b32(), v650, v656); + svfloat32_t v671 = svsub_f32_x(svptrue_b32(), v650, v653); + svfloat32_t v673 = svadd_f32_x(svptrue_b32(), v650, v651); + svfloat32_t v676 = svadd_f32_x(svptrue_b32(), v675, v663); + svfloat32_t v678 = svsub_f32_x(svptrue_b32(), v677, v541); + svfloat32_t v680 = svadd_f32_x(svptrue_b32(), v679, v662); + svfloat32_t v682 = svsub_f32_x(svptrue_b32(), v681, v541); + svfloat32_t v685 = svadd_f32_x(svptrue_b32(), v684, v662); + svfloat32_t v875 = svadd_f32_x(svptrue_b32(), v874, v859); + svfloat32_t v876 = svsub_f32_x(svptrue_b32(), v859, v861); + svfloat32_t v878 = svadd_f32_x(svptrue_b32(), v859, v865); + svfloat32_t v880 = svsub_f32_x(svptrue_b32(), v859, v862); + svfloat32_t v882 = svadd_f32_x(svptrue_b32(), v859, v860); + svfloat32_t v885 = svadd_f32_x(svptrue_b32(), v884, v872); + svfloat32_t v887 = svsub_f32_x(svptrue_b32(), v886, v750); + svfloat32_t v889 = svadd_f32_x(svptrue_b32(), v888, v871); + svfloat32_t v891 = svsub_f32_x(svptrue_b32(), v890, v750); + svfloat32_t v894 = svadd_f32_x(svptrue_b32(), v893, v871); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1332), v1522, + svreinterpret_u64_s16(v908)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1341), v1522, + svreinterpret_u64_s16(v916)); + svfloat32_t v668 = svsub_f32_x(svptrue_b32(), v667, v657); + svfloat32_t v670 = svadd_f32_x(svptrue_b32(), v669, v658); + svfloat32_t v672 = svsub_f32_x(svptrue_b32(), v671, v658); + svfloat32_t v674 = svsub_f32_x(svptrue_b32(), v673, v654); + svfloat32_t v686 = svsub_f32_x(svptrue_b32(), v685, v541); + svfloat32_t v688 = svadd_f32_x(svptrue_b32(), v666, v676); + svfloat32_t v695 = svsub_f32_x(svptrue_b32(), v666, v676); + svfloat32_t v877 = svsub_f32_x(svptrue_b32(), v876, v866); + svfloat32_t v879 = svadd_f32_x(svptrue_b32(), v878, v867); + svfloat32_t v881 = svsub_f32_x(svptrue_b32(), v880, v867); + svfloat32_t v883 = svsub_f32_x(svptrue_b32(), v882, v863); + svfloat32_t v895 = svsub_f32_x(svptrue_b32(), v894, v750); + svfloat32_t v897 = svadd_f32_x(svptrue_b32(), v875, v885); + svfloat32_t v904 = svsub_f32_x(svptrue_b32(), v875, v885); + svfloat32_t v687 = svadd_f32_x(svptrue_b32(), v674, v686); + svfloat32_t v689 = svadd_f32_x(svptrue_b32(), v668, v678); + svfloat32_t v690 = svsub_f32_x(svptrue_b32(), v670, v680); + svfloat32_t v691 = svadd_f32_x(svptrue_b32(), v672, v682); + 
svfloat32_t v692 = svsub_f32_x(svptrue_b32(), v672, v682); + svfloat32_t v693 = svadd_f32_x(svptrue_b32(), v670, v680); + svfloat32_t v694 = svsub_f32_x(svptrue_b32(), v668, v678); + svfloat32_t v696 = svsub_f32_x(svptrue_b32(), v674, v686); + svfloat32_t v896 = svadd_f32_x(svptrue_b32(), v883, v895); + svfloat32_t v898 = svadd_f32_x(svptrue_b32(), v877, v887); + svfloat32_t v899 = svsub_f32_x(svptrue_b32(), v879, v889); + svfloat32_t v900 = svadd_f32_x(svptrue_b32(), v881, v891); + svfloat32_t v901 = svsub_f32_x(svptrue_b32(), v881, v891); + svfloat32_t v902 = svadd_f32_x(svptrue_b32(), v879, v889); + svfloat32_t v903 = svsub_f32_x(svptrue_b32(), v877, v887); + svfloat32_t v905 = svsub_f32_x(svptrue_b32(), v883, v895); + svint16_t v940 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v695, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v948 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v904, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v1052 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v688, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v1060 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v897, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v924 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v696, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v932 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v905, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v956 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v694, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v964 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v903, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v972 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v693, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v980 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v902, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v988 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v692, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v996 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v901, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v1004 = svtbl_s16( + 
svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v691, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v1012 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v900, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v1020 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v690, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v1028 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v899, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v1036 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v689, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v1044 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v898, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v1068 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v687, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v1076 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v896, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1368), v1522, + svreinterpret_u64_s16(v940)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1377), v1522, + svreinterpret_u64_s16(v948)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1494), v1522, + svreinterpret_u64_s16(v1052)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1503), v1522, + svreinterpret_u64_s16(v1060)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1350), v1522, + svreinterpret_u64_s16(v924)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1359), v1522, + svreinterpret_u64_s16(v932)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1386), v1522, + svreinterpret_u64_s16(v956)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1395), v1522, + svreinterpret_u64_s16(v964)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1404), v1522, + svreinterpret_u64_s16(v972)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1413), v1522, + svreinterpret_u64_s16(v980)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1422), v1522, + svreinterpret_u64_s16(v988)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1431), v1522, + svreinterpret_u64_s16(v996)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1440), v1522, + svreinterpret_u64_s16(v1004)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1449), v1522, + svreinterpret_u64_s16(v1012)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1458), v1522, + svreinterpret_u64_s16(v1020)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1467), v1522, + svreinterpret_u64_s16(v1028)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1476), v1522, + svreinterpret_u64_s16(v1036)); + 
svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1485), v1522, + svreinterpret_u64_s16(v1044)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1512), v1522, + svreinterpret_u64_s16(v1068)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1521), v1522, + svreinterpret_u64_s16(v1076)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cs16_ab_t_gs24(const armral_cmplx_f32_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, + const armral_cmplx_f32_t *restrict w, + int howmany, int idist, int odist, + float dir) { + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + int32_t *v6 = (int32_t *)y; + const float32x2_t *v7 = (const float32x2_t *)w; + for (int j = 0; j < howmany; j += 1) { + float32x2_t v260 = v5[istride]; + float v706 = 1.0000000000000000e+00F; + float v707 = -1.0000000000000000e+00F; + float v714 = -7.0710678118654746e-01F; + float v721 = 7.0710678118654757e-01F; + float v773 = -1.4999999999999998e+00F; + float v774 = 1.4999999999999998e+00F; + float v781 = 1.0606601717798210e+00F; + float v788 = -1.0606601717798212e+00F; + float v842 = 8.6602540378443871e-01F; + float v850 = -8.6602540378443871e-01F; + float v857 = 6.1237243569579458e-01F; + float v858 = -6.1237243569579458e-01F; + float32x2_t v860 = (float32x2_t){v4, v4}; + float32x2_t v292 = vtrn1_f32(v260, v260); + float32x2_t v293 = vtrn2_f32(v260, v260); + float32x2_t v644 = v5[0]; + float32x2_t v708 = (float32x2_t){v706, v707}; + float32x2_t v715 = (float32x2_t){v721, v714}; + float32x2_t v722 = (float32x2_t){v721, v721}; + float32x2_t v771 = (float32x2_t){v773, v773}; + float32x2_t v775 = (float32x2_t){v773, v774}; + float32x2_t v782 = (float32x2_t){v788, v781}; + float32x2_t v789 = (float32x2_t){v788, v788}; + float32x2_t v844 = (float32x2_t){v842, v850}; + float32x2_t v851 = (float32x2_t){v850, v850}; + float32x2_t v855 = (float32x2_t){v858, v858}; + float32x2_t v859 = (float32x2_t){v857, v858}; + float32x2_t v20 = v5[istride * 8]; + float32x2_t v38 = v5[istride * 16]; + int64_t v55 = 14 + j * 46; + int64_t v68 = 30 + j * 46; + float32x2_t v82 = v5[istride * 11]; + float32x2_t v100 = v5[istride * 19]; + int64_t v117 = 20 + j * 46; + int64_t v130 = 36 + j * 46; + float32x2_t v144 = v5[istride * 3]; + int64_t v148 = 4 + j * 46; + float32x2_t v162 = v5[istride * 14]; + float32x2_t v180 = v5[istride * 22]; + int64_t v197 = 26 + j * 46; + int64_t v210 = 42 + j * 46; + float32x2_t v224 = v5[istride * 6]; + int64_t v228 = 10 + j * 46; + float32x2_t v242 = v5[istride * 17]; + int64_t v277 = 32 + j * 46; + float32x2_t v291 = v7[j * 46]; + int64_t v295 = j * 46 + 1; + float32x2_t v304 = v5[istride * 9]; + int64_t v308 = 16 + j * 46; + float32x2_t v322 = v5[istride * 20]; + float32x2_t v340 = v5[istride * 4]; + int64_t v357 = 38 + j * 46; + int64_t v370 = 6 + j * 46; + float32x2_t v384 = v5[istride * 12]; + int64_t v388 = 22 + j * 46; + float32x2_t v402 = v5[istride * 23]; + float32x2_t v420 = v5[istride * 7]; + int64_t v437 = 44 + j * 46; + int64_t v450 = 12 + j * 46; + float32x2_t v464 = v5[istride * 15]; + int64_t v468 = 28 + j * 46; + float32x2_t v482 = v5[istride * 2]; + float32x2_t v500 = v5[istride * 10]; + int64_t v517 = 2 + j * 46; + int64_t v530 = 18 + j * 46; + float32x2_t v544 = v5[istride * 18]; + int64_t v548 = 34 + j * 46; + float32x2_t v562 = v5[istride * 5]; + float32x2_t v580 = v5[istride * 13]; + int64_t v597 = 8 + j * 46; + int64_t v610 = 24 + j * 46; + float32x2_t v624 = v5[istride * 21]; + int64_t v628 = 
40 + j * 46; + float32x2_t v710 = vmul_f32(v860, v708); + float32x2_t v717 = vmul_f32(v860, v715); + float32x2_t v777 = vmul_f32(v860, v775); + float32x2_t v784 = vmul_f32(v860, v782); + float32x2_t v846 = vmul_f32(v860, v844); + float32x2_t v861 = vmul_f32(v860, v859); + float32x2_t v56 = v7[v55]; + float32x2_t v57 = vtrn1_f32(v20, v20); + float32x2_t v58 = vtrn2_f32(v20, v20); + int64_t v60 = v55 + 1; + float32x2_t v69 = v7[v68]; + float32x2_t v70 = vtrn1_f32(v38, v38); + float32x2_t v71 = vtrn2_f32(v38, v38); + int64_t v73 = v68 + 1; + float32x2_t v118 = v7[v117]; + float32x2_t v119 = vtrn1_f32(v82, v82); + float32x2_t v120 = vtrn2_f32(v82, v82); + int64_t v122 = v117 + 1; + float32x2_t v131 = v7[v130]; + float32x2_t v132 = vtrn1_f32(v100, v100); + float32x2_t v133 = vtrn2_f32(v100, v100); + int64_t v135 = v130 + 1; + float32x2_t v149 = v7[v148]; + float32x2_t v150 = vtrn1_f32(v144, v144); + float32x2_t v151 = vtrn2_f32(v144, v144); + int64_t v153 = v148 + 1; + float32x2_t v198 = v7[v197]; + float32x2_t v199 = vtrn1_f32(v162, v162); + float32x2_t v200 = vtrn2_f32(v162, v162); + int64_t v202 = v197 + 1; + float32x2_t v211 = v7[v210]; + float32x2_t v212 = vtrn1_f32(v180, v180); + float32x2_t v213 = vtrn2_f32(v180, v180); + int64_t v215 = v210 + 1; + float32x2_t v229 = v7[v228]; + float32x2_t v230 = vtrn1_f32(v224, v224); + float32x2_t v231 = vtrn2_f32(v224, v224); + int64_t v233 = v228 + 1; + float32x2_t v278 = v7[v277]; + float32x2_t v279 = vtrn1_f32(v242, v242); + float32x2_t v280 = vtrn2_f32(v242, v242); + int64_t v282 = v277 + 1; + float32x2_t v296 = v7[v295]; + float32x2_t v297 = vmul_f32(v292, v291); + float32x2_t v309 = v7[v308]; + float32x2_t v310 = vtrn1_f32(v304, v304); + float32x2_t v311 = vtrn2_f32(v304, v304); + int64_t v313 = v308 + 1; + float32x2_t v358 = v7[v357]; + float32x2_t v359 = vtrn1_f32(v322, v322); + float32x2_t v360 = vtrn2_f32(v322, v322); + int64_t v362 = v357 + 1; + float32x2_t v371 = v7[v370]; + float32x2_t v372 = vtrn1_f32(v340, v340); + float32x2_t v373 = vtrn2_f32(v340, v340); + int64_t v375 = v370 + 1; + float32x2_t v389 = v7[v388]; + float32x2_t v390 = vtrn1_f32(v384, v384); + float32x2_t v391 = vtrn2_f32(v384, v384); + int64_t v393 = v388 + 1; + float32x2_t v438 = v7[v437]; + float32x2_t v439 = vtrn1_f32(v402, v402); + float32x2_t v440 = vtrn2_f32(v402, v402); + int64_t v442 = v437 + 1; + float32x2_t v451 = v7[v450]; + float32x2_t v452 = vtrn1_f32(v420, v420); + float32x2_t v453 = vtrn2_f32(v420, v420); + int64_t v455 = v450 + 1; + float32x2_t v469 = v7[v468]; + float32x2_t v470 = vtrn1_f32(v464, v464); + float32x2_t v471 = vtrn2_f32(v464, v464); + int64_t v473 = v468 + 1; + float32x2_t v518 = v7[v517]; + float32x2_t v519 = vtrn1_f32(v482, v482); + float32x2_t v520 = vtrn2_f32(v482, v482); + int64_t v522 = v517 + 1; + float32x2_t v531 = v7[v530]; + float32x2_t v532 = vtrn1_f32(v500, v500); + float32x2_t v533 = vtrn2_f32(v500, v500); + int64_t v535 = v530 + 1; + float32x2_t v549 = v7[v548]; + float32x2_t v550 = vtrn1_f32(v544, v544); + float32x2_t v551 = vtrn2_f32(v544, v544); + int64_t v553 = v548 + 1; + float32x2_t v598 = v7[v597]; + float32x2_t v599 = vtrn1_f32(v562, v562); + float32x2_t v600 = vtrn2_f32(v562, v562); + int64_t v602 = v597 + 1; + float32x2_t v611 = v7[v610]; + float32x2_t v612 = vtrn1_f32(v580, v580); + float32x2_t v613 = vtrn2_f32(v580, v580); + int64_t v615 = v610 + 1; + float32x2_t v629 = v7[v628]; + float32x2_t v630 = vtrn1_f32(v624, v624); + float32x2_t v631 = vtrn2_f32(v624, v624); + int64_t v633 = v628 + 1; + float32x2_t v61 = 
v7[v60]; + float32x2_t v62 = vmul_f32(v57, v56); + float32x2_t v74 = v7[v73]; + float32x2_t v75 = vmul_f32(v70, v69); + float32x2_t v123 = v7[v122]; + float32x2_t v124 = vmul_f32(v119, v118); + float32x2_t v136 = v7[v135]; + float32x2_t v137 = vmul_f32(v132, v131); + float32x2_t v154 = v7[v153]; + float32x2_t v155 = vmul_f32(v150, v149); + float32x2_t v203 = v7[v202]; + float32x2_t v204 = vmul_f32(v199, v198); + float32x2_t v216 = v7[v215]; + float32x2_t v217 = vmul_f32(v212, v211); + float32x2_t v234 = v7[v233]; + float32x2_t v235 = vmul_f32(v230, v229); + float32x2_t v283 = v7[v282]; + float32x2_t v284 = vmul_f32(v279, v278); + float32x2_t v314 = v7[v313]; + float32x2_t v315 = vmul_f32(v310, v309); + float32x2_t v363 = v7[v362]; + float32x2_t v364 = vmul_f32(v359, v358); + float32x2_t v376 = v7[v375]; + float32x2_t v377 = vmul_f32(v372, v371); + float32x2_t v394 = v7[v393]; + float32x2_t v395 = vmul_f32(v390, v389); + float32x2_t v443 = v7[v442]; + float32x2_t v444 = vmul_f32(v439, v438); + float32x2_t v456 = v7[v455]; + float32x2_t v457 = vmul_f32(v452, v451); + float32x2_t v474 = v7[v473]; + float32x2_t v475 = vmul_f32(v470, v469); + float32x2_t v523 = v7[v522]; + float32x2_t v524 = vmul_f32(v519, v518); + float32x2_t v536 = v7[v535]; + float32x2_t v537 = vmul_f32(v532, v531); + float32x2_t v554 = v7[v553]; + float32x2_t v555 = vmul_f32(v550, v549); + float32x2_t v603 = v7[v602]; + float32x2_t v604 = vmul_f32(v599, v598); + float32x2_t v616 = v7[v615]; + float32x2_t v617 = vmul_f32(v612, v611); + float32x2_t v634 = v7[v633]; + float32x2_t v635 = vmul_f32(v630, v629); + float32x2_t v299 = vfma_f32(v297, v293, v296); + float32x2_t v64 = vfma_f32(v62, v58, v61); + float32x2_t v77 = vfma_f32(v75, v71, v74); + float32x2_t v126 = vfma_f32(v124, v120, v123); + float32x2_t v139 = vfma_f32(v137, v133, v136); + float32x2_t v157 = vfma_f32(v155, v151, v154); + float32x2_t v206 = vfma_f32(v204, v200, v203); + float32x2_t v219 = vfma_f32(v217, v213, v216); + float32x2_t v237 = vfma_f32(v235, v231, v234); + float32x2_t v286 = vfma_f32(v284, v280, v283); + float32x2_t v317 = vfma_f32(v315, v311, v314); + float32x2_t v366 = vfma_f32(v364, v360, v363); + float32x2_t v379 = vfma_f32(v377, v373, v376); + float32x2_t v397 = vfma_f32(v395, v391, v394); + float32x2_t v446 = vfma_f32(v444, v440, v443); + float32x2_t v459 = vfma_f32(v457, v453, v456); + float32x2_t v477 = vfma_f32(v475, v471, v474); + float32x2_t v526 = vfma_f32(v524, v520, v523); + float32x2_t v539 = vfma_f32(v537, v533, v536); + float32x2_t v557 = vfma_f32(v555, v551, v554); + float32x2_t v606 = vfma_f32(v604, v600, v603); + float32x2_t v619 = vfma_f32(v617, v613, v616); + float32x2_t v637 = vfma_f32(v635, v631, v634); + float32x2_t v638 = vadd_f32(v64, v77); + float32x2_t v639 = vsub_f32(v64, v77); + float32x2_t v646 = vadd_f32(v126, v139); + float32x2_t v647 = vsub_f32(v126, v139); + float32x2_t v649 = vadd_f32(v206, v219); + float32x2_t v650 = vsub_f32(v206, v219); + float32x2_t v652 = vadd_f32(v286, v299); + float32x2_t v653 = vsub_f32(v286, v299); + float32x2_t v655 = vadd_f32(v366, v379); + float32x2_t v656 = vsub_f32(v366, v379); + float32x2_t v658 = vadd_f32(v446, v459); + float32x2_t v659 = vsub_f32(v446, v459); + float32x2_t v661 = vadd_f32(v526, v539); + float32x2_t v662 = vsub_f32(v526, v539); + float32x2_t v664 = vadd_f32(v606, v619); + float32x2_t v665 = vsub_f32(v606, v619); + float32x2_t v645 = vadd_f32(v638, v644); + float32x2_t v648 = vadd_f32(v646, v157); + float32x2_t v651 = vadd_f32(v649, v237); + float32x2_t v654 = 
vadd_f32(v652, v317); + float32x2_t v657 = vadd_f32(v655, v397); + float32x2_t v660 = vadd_f32(v658, v477); + float32x2_t v663 = vadd_f32(v661, v557); + float32x2_t v666 = vadd_f32(v664, v637); + float32x2_t v734 = vadd_f32(v638, v655); + float32x2_t v735 = vsub_f32(v638, v655); + float32x2_t v736 = vadd_f32(v649, v661); + float32x2_t v737 = vsub_f32(v649, v661); + float32x2_t v738 = vadd_f32(v646, v658); + float32x2_t v739 = vsub_f32(v646, v658); + float32x2_t v740 = vadd_f32(v652, v664); + float32x2_t v741 = vsub_f32(v652, v664); + float32x2_t v801 = vadd_f32(v639, v656); + float32x2_t v802 = vsub_f32(v639, v656); + float32x2_t v803 = vadd_f32(v650, v662); + float32x2_t v804 = vsub_f32(v650, v662); + float32x2_t v805 = vadd_f32(v647, v659); + float32x2_t v806 = vsub_f32(v647, v659); + float32x2_t v807 = vadd_f32(v653, v665); + float32x2_t v808 = vsub_f32(v653, v665); + float32x2_t v667 = vadd_f32(v645, v657); + float32x2_t v668 = vsub_f32(v645, v657); + float32x2_t v669 = vadd_f32(v651, v663); + float32x2_t v670 = vsub_f32(v651, v663); + float32x2_t v671 = vadd_f32(v648, v660); + float32x2_t v672 = vsub_f32(v648, v660); + float32x2_t v673 = vadd_f32(v654, v666); + float32x2_t v674 = vsub_f32(v654, v666); + float32x2_t v742 = vadd_f32(v734, v736); + float32x2_t v743 = vsub_f32(v734, v736); + float32x2_t v744 = vadd_f32(v738, v740); + float32x2_t v745 = vsub_f32(v738, v740); + float32x2_t v748 = vadd_f32(v739, v741); + float32x2_t v749 = vsub_f32(v739, v741); + float32x2_t v772 = vmul_f32(v735, v771); + float32x2_t v778 = vrev64_f32(v737); + float32x2_t v809 = vadd_f32(v801, v803); + float32x2_t v810 = vsub_f32(v801, v803); + float32x2_t v811 = vadd_f32(v805, v807); + float32x2_t v812 = vsub_f32(v805, v807); + float32x2_t v815 = vadd_f32(v806, v808); + float32x2_t v816 = vsub_f32(v806, v808); + float32x2_t v847 = vrev64_f32(v802); + float32x2_t v852 = vmul_f32(v804, v851); + float32x2_t v675 = vadd_f32(v667, v669); + float32x2_t v676 = vsub_f32(v667, v669); + float32x2_t v677 = vadd_f32(v671, v673); + float32x2_t v678 = vsub_f32(v671, v673); + float32x2_t v681 = vadd_f32(v672, v674); + float32x2_t v682 = vsub_f32(v672, v674); + float32x2_t v711 = vrev64_f32(v670); + float32x2_t v746 = vadd_f32(v742, v744); + float32x2_t v747 = vsub_f32(v742, v744); + float32x2_t v761 = vmul_f32(v743, v771); + float32x2_t v767 = vrev64_f32(v745); + float32x2_t v779 = vmul_f32(v778, v777); + float32x2_t v785 = vrev64_f32(v748); + float32x2_t v790 = vmul_f32(v749, v789); + float32x2_t v813 = vadd_f32(v809, v811); + float32x2_t v814 = vsub_f32(v809, v811); + float32x2_t v836 = vrev64_f32(v810); + float32x2_t v841 = vmul_f32(v812, v851); + float32x2_t v848 = vmul_f32(v847, v846); + float32x2_t v856 = vmul_f32(v815, v855); + float32x2_t v862 = vrev64_f32(v816); + float32x2_t v679 = vadd_f32(v675, v677); + float32x2_t v680 = vsub_f32(v675, v677); + float32x2_t v700 = vrev64_f32(v678); + float32x2_t v712 = vmul_f32(v711, v710); + float32x2_t v718 = vrev64_f32(v681); + float32x2_t v723 = vmul_f32(v682, v722); + float32x2_t v753 = vmul_f32(v746, v771); + float32x2_t v757 = vmul_f32(v747, v771); + float32x2_t v768 = vmul_f32(v767, v777); + float32x2_t v786 = vmul_f32(v785, v784); + float32x2_t v793 = vadd_f32(v772, v790); + float32x2_t v794 = vsub_f32(v772, v790); + float32x2_t v822 = vrev64_f32(v813); + float32x2_t v829 = vrev64_f32(v814); + float32x2_t v837 = vmul_f32(v836, v846); + float32x2_t v863 = vmul_f32(v862, v861); + float32x2_t v868 = vadd_f32(v852, v856); + float32x2_t v869 = vsub_f32(v852, v856); + 
float32x2_t v701 = vmul_f32(v700, v710); + float32x2_t v719 = vmul_f32(v718, v717); + float32x2_t v726 = vadd_f32(v668, v723); + float32x2_t v727 = vsub_f32(v668, v723); + float32x2_t v791 = vadd_f32(v761, v768); + float32x2_t v792 = vsub_f32(v761, v768); + float32x2_t v795 = vadd_f32(v779, v786); + float32x2_t v796 = vsub_f32(v779, v786); + float32x2_t v823 = vmul_f32(v822, v846); + float32x2_t v830 = vmul_f32(v829, v846); + float32x2_t v864 = vadd_f32(v837, v841); + float32x2_t v865 = vsub_f32(v837, v841); + float32x2_t v866 = vadd_f32(v848, v863); + float32x2_t v867 = vsub_f32(v848, v863); + float32x2_t v874 = vadd_f32(v679, v753); + int16x4_t v879 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v679, 15), (int32x2_t){0, 0})); + float32x2_t v958 = vadd_f32(v680, v757); + int16x4_t v963 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v680, 15), (int32x2_t){0, 0})); + float32x2_t v724 = vadd_f32(v676, v701); + float32x2_t v725 = vsub_f32(v676, v701); + float32x2_t v728 = vadd_f32(v712, v719); + float32x2_t v729 = vsub_f32(v712, v719); + float32x2_t v797 = vadd_f32(v793, v795); + float32x2_t v798 = vsub_f32(v793, v795); + float32x2_t v799 = vadd_f32(v794, v796); + float32x2_t v800 = vsub_f32(v794, v796); + float32x2_t v870 = vadd_f32(v866, v868); + float32x2_t v871 = vsub_f32(v866, v868); + float32x2_t v872 = vadd_f32(v867, v869); + float32x2_t v873 = vsub_f32(v867, v869); + float32x2_t v875 = vadd_f32(v874, v823); + float32x2_t v876 = vsub_f32(v874, v823); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v879), 0); + float32x2_t v959 = vadd_f32(v958, v830); + float32x2_t v960 = vsub_f32(v958, v830); + v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v963), 0); + float32x2_t v730 = vadd_f32(v726, v728); + float32x2_t v731 = vsub_f32(v726, v728); + float32x2_t v732 = vadd_f32(v727, v729); + float32x2_t v733 = vsub_f32(v727, v729); + int16x4_t v885 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v876, 15), (int32x2_t){0, 0})); + int16x4_t v891 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v875, 15), (int32x2_t){0, 0})); + float32x2_t v916 = vadd_f32(v725, v792); + int16x4_t v921 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v725, 15), (int32x2_t){0, 0})); + int16x4_t v969 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v960, 15), (int32x2_t){0, 0})); + int16x4_t v975 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v959, 15), (int32x2_t){0, 0})); + float32x2_t v1000 = vadd_f32(v724, v791); + int16x4_t v1005 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v724, 15), (int32x2_t){0, 0})); + v6[ostride * 16] = vget_lane_s32(vreinterpret_s32_s16(v885), 0); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v891), 0); + float32x2_t v895 = vadd_f32(v731, v798); + int16x4_t v900 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v731, 15), (int32x2_t){0, 0})); + float32x2_t v917 = vadd_f32(v916, v865); + float32x2_t v918 = vsub_f32(v916, v865); + v6[ostride * 18] = vget_lane_s32(vreinterpret_s32_s16(v921), 0); + float32x2_t v937 = vadd_f32(v732, v799); + int16x4_t v942 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v732, 15), (int32x2_t){0, 0})); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v969), 0); + v6[ostride * 20] = vget_lane_s32(vreinterpret_s32_s16(v975), 0); + float32x2_t v979 = vadd_f32(v733, v800); + int16x4_t v984 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v733, 15), (int32x2_t){0, 0})); + float32x2_t v1001 = vadd_f32(v1000, v864); + float32x2_t v1002 = vsub_f32(v1000, v864); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v1005), 0); + float32x2_t v1021 = vadd_f32(v730, v797); + int16x4_t v1026 = + 
vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v730, 15), (int32x2_t){0, 0})); + float32x2_t v896 = vadd_f32(v895, v871); + float32x2_t v897 = vsub_f32(v895, v871); + v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v900), 0); + int16x4_t v927 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v918, 15), (int32x2_t){0, 0})); + int16x4_t v933 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v917, 15), (int32x2_t){0, 0})); + float32x2_t v938 = vadd_f32(v937, v872); + float32x2_t v939 = vsub_f32(v937, v872); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v942), 0); + float32x2_t v980 = vadd_f32(v979, v873); + float32x2_t v981 = vsub_f32(v979, v873); + v6[ostride * 21] = vget_lane_s32(vreinterpret_s32_s16(v984), 0); + int16x4_t v1011 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1002, 15), (int32x2_t){0, 0})); + int16x4_t v1017 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1001, 15), (int32x2_t){0, 0})); + float32x2_t v1022 = vadd_f32(v1021, v870); + float32x2_t v1023 = vsub_f32(v1021, v870); + v6[ostride * 15] = vget_lane_s32(vreinterpret_s32_s16(v1026), 0); + int16x4_t v906 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v897, 15), (int32x2_t){0, 0})); + int16x4_t v912 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v896, 15), (int32x2_t){0, 0})); + v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v927), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v933), 0); + int16x4_t v948 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v939, 15), (int32x2_t){0, 0})); + int16x4_t v954 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v938, 15), (int32x2_t){0, 0})); + int16x4_t v990 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v981, 15), (int32x2_t){0, 0})); + int16x4_t v996 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v980, 15), (int32x2_t){0, 0})); + v6[ostride * 22] = vget_lane_s32(vreinterpret_s32_s16(v1011), 0); + v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v1017), 0); + int16x4_t v1032 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1023, 15), (int32x2_t){0, 0})); + int16x4_t v1038 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1022, 15), (int32x2_t){0, 0})); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v906), 0); + v6[ostride * 17] = vget_lane_s32(vreinterpret_s32_s16(v912), 0); + v6[ostride * 19] = vget_lane_s32(vreinterpret_s32_s16(v948), 0); + v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v954), 0); + v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v990), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v996), 0); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v1032), 0); + v6[ostride * 23] = vget_lane_s32(vreinterpret_s32_s16(v1038), 0); + v5 += 1 * idist; + v6 += 1 * odist; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cs16_ab_t_gs24(const armral_cmplx_f32_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, + const armral_cmplx_f32_t *restrict w, + int howmany, int idist, int odist, + float dir) { + int64_t v0 = istride; + int64_t v1 = idist; + int64_t v2 = ostride; + int64_t v3 = odist; + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + int32_t *v6 = (int32_t *)y; + const float32x2_t *v7 = (const float32x2_t *)w; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * v1; + int64_t v12 = v10 * v3; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + int64_t v13 = j; + float v527 = -1.0000000000000000e+00F; + float v534 = -7.0710678118654746e-01F; + float v541 = 7.0710678118654757e-01F; + float v594 = -1.4999999999999998e+00F; + float v599 = 
1.4999999999999998e+00F; + float v606 = 1.0606601717798210e+00F; + float v613 = -1.0606601717798212e+00F; + float v677 = -8.6602540378443871e-01F; + float v687 = -6.1237243569579458e-01F; + const float32x2_t *v1006 = &v5[v0]; + int32_t *v1205 = &v6[v2]; + int64_t v19 = v0 * 8; + int64_t v33 = v0 * 16; + int64_t v48 = v10 * 7; + int64_t v55 = v10 * 15; + int64_t v61 = v0 * 11; + int64_t v75 = v0 * 19; + int64_t v90 = v10 * 10; + int64_t v97 = v10 * 18; + int64_t v103 = v0 * 3; + int64_t v111 = v10 * 2; + int64_t v117 = v0 * 14; + int64_t v131 = v0 * 22; + int64_t v146 = v10 * 13; + int64_t v153 = v10 * 21; + int64_t v159 = v0 * 6; + int64_t v167 = v10 * 5; + int64_t v173 = v0 * 17; + int64_t v202 = v10 * 16; + int64_t v215 = v0 * 9; + int64_t v223 = v10 * 8; + int64_t v229 = v0 * 20; + int64_t v243 = v0 * 4; + int64_t v258 = v10 * 19; + int64_t v265 = v10 * 3; + int64_t v271 = v0 * 12; + int64_t v279 = v10 * 11; + int64_t v285 = v0 * 23; + int64_t v299 = v0 * 7; + int64_t v314 = v10 * 22; + int64_t v321 = v10 * 6; + int64_t v327 = v0 * 15; + int64_t v335 = v10 * 14; + int64_t v341 = v0 * 2; + int64_t v355 = v0 * 10; + int64_t v377 = v10 * 9; + int64_t v383 = v0 * 18; + int64_t v391 = v10 * 17; + int64_t v397 = v0 * 5; + int64_t v411 = v0 * 13; + int64_t v426 = v10 * 4; + int64_t v433 = v10 * 12; + int64_t v439 = v0 * 21; + int64_t v447 = v10 * 20; + int64_t v448 = v13 * 23; + float v530 = v4 * v527; + float v537 = v4 * v534; + float v602 = v4 * v599; + float v609 = v4 * v606; + float v673 = v4 * v677; + float v690 = v4 * v687; + int64_t v715 = v2 * 16; + int64_t v723 = v2 * 8; + int64_t v734 = v2 * 9; + int64_t v750 = v2 * 17; + int64_t v761 = v2 * 18; + int64_t v769 = v2 * 10; + int64_t v777 = v2 * 2; + int64_t v788 = v2 * 3; + int64_t v796 = v2 * 19; + int64_t v804 = v2 * 11; + int64_t v815 = v2 * 12; + int64_t v823 = v2 * 4; + int64_t v831 = v2 * 20; + int64_t v842 = v2 * 21; + int64_t v850 = v2 * 13; + int64_t v858 = v2 * 5; + int64_t v869 = v2 * 6; + int64_t v877 = v2 * 22; + int64_t v885 = v2 * 14; + int64_t v896 = v2 * 15; + int64_t v904 = v2 * 7; + int64_t v912 = v2 * 23; + const float32x2_t *v1135 = &v5[0]; + svint64_t v1136 = svindex_s64(0, v1); + svfloat32_t v1145 = svdup_n_f32(v541); + svfloat32_t v1150 = svdup_n_f32(v594); + svfloat32_t v1153 = svdup_n_f32(v613); + svfloat32_t v1159 = svdup_n_f32(v677); + svfloat32_t v1160 = svdup_n_f32(v687); + int32_t *v1169 = &v6[0]; + svint64_t v1377 = svindex_s64(0, v3); + int64_t v50 = v48 + v448; + int64_t v57 = v55 + v448; + int64_t v92 = v90 + v448; + int64_t v99 = v97 + v448; + int64_t v113 = v111 + v448; + int64_t v148 = v146 + v448; + int64_t v155 = v153 + v448; + int64_t v169 = v167 + v448; + int64_t v204 = v202 + v448; + svfloat32_t v212 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v448])); + int64_t v225 = v223 + v448; + int64_t v260 = v258 + v448; + int64_t v267 = v265 + v448; + int64_t v281 = v279 + v448; + int64_t v316 = v314 + v448; + int64_t v323 = v321 + v448; + int64_t v337 = v335 + v448; + int64_t v372 = v10 + v448; + int64_t v379 = v377 + v448; + int64_t v393 = v391 + v448; + int64_t v428 = v426 + v448; + int64_t v435 = v433 + v448; + int64_t v449 = v447 + v448; + const float32x2_t *v925 = &v5[v19]; + const float32x2_t *v934 = &v5[v33]; + const float32x2_t *v943 = &v5[v61]; + const float32x2_t *v952 = &v5[v75]; + const float32x2_t *v961 = &v5[v103]; + const float32x2_t *v970 = &v5[v117]; + const float32x2_t *v979 = &v5[v131]; + const float32x2_t *v988 = &v5[v159]; + const float32x2_t *v997 = 
&v5[v173]; + svfloat32_t v1008 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1006), v1136)); + const float32x2_t *v1017 = &v5[v215]; + const float32x2_t *v1026 = &v5[v229]; + const float32x2_t *v1035 = &v5[v243]; + const float32x2_t *v1044 = &v5[v271]; + const float32x2_t *v1053 = &v5[v285]; + const float32x2_t *v1062 = &v5[v299]; + const float32x2_t *v1071 = &v5[v327]; + const float32x2_t *v1080 = &v5[v341]; + const float32x2_t *v1089 = &v5[v355]; + const float32x2_t *v1098 = &v5[v383]; + const float32x2_t *v1107 = &v5[v397]; + const float32x2_t *v1116 = &v5[v411]; + const float32x2_t *v1125 = &v5[v439]; + svfloat32_t v1137 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1135), v1136)); + svfloat32_t v1143 = svdup_n_f32(v530); + svfloat32_t v1144 = svdup_n_f32(v537); + svfloat32_t v1151 = svdup_n_f32(v602); + svfloat32_t v1152 = svdup_n_f32(v609); + svfloat32_t v1158 = svdup_n_f32(v673); + svfloat32_t v1161 = svdup_n_f32(v690); + int32_t *v1178 = &v6[v715]; + int32_t *v1187 = &v6[v723]; + int32_t *v1196 = &v6[v734]; + int32_t *v1214 = &v6[v750]; + int32_t *v1223 = &v6[v761]; + int32_t *v1232 = &v6[v769]; + int32_t *v1241 = &v6[v777]; + int32_t *v1250 = &v6[v788]; + int32_t *v1259 = &v6[v796]; + int32_t *v1268 = &v6[v804]; + int32_t *v1277 = &v6[v815]; + int32_t *v1286 = &v6[v823]; + int32_t *v1295 = &v6[v831]; + int32_t *v1304 = &v6[v842]; + int32_t *v1313 = &v6[v850]; + int32_t *v1322 = &v6[v858]; + int32_t *v1331 = &v6[v869]; + int32_t *v1340 = &v6[v877]; + int32_t *v1349 = &v6[v885]; + int32_t *v1358 = &v6[v896]; + int32_t *v1367 = &v6[v904]; + int32_t *v1376 = &v6[v912]; + svfloat32_t v51 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v50])); + svfloat32_t v58 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v57])); + svfloat32_t v93 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v92])); + svfloat32_t v100 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v99])); + svfloat32_t v114 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v113])); + svfloat32_t v149 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v148])); + svfloat32_t v156 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v155])); + svfloat32_t v170 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v169])); + svfloat32_t v205 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v204])); + svfloat32_t zero213 = svdup_n_f32(0); + svfloat32_t v213 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero213, v1008, v212, 0), v1008, + v212, 90); + svfloat32_t v226 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v225])); + svfloat32_t v261 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v260])); + svfloat32_t v268 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v267])); + svfloat32_t v282 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v281])); + svfloat32_t v317 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v316])); + svfloat32_t v324 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v323])); + svfloat32_t v338 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v337])); + svfloat32_t v373 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v372])); + svfloat32_t v380 = svreinterpret_f32_f64( + svld1_f64(pred_full, 
&((const double *)v7)[v379])); + svfloat32_t v394 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v393])); + svfloat32_t v429 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v428])); + svfloat32_t v436 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v435])); + svfloat32_t v450 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v449])); + svfloat32_t v927 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v925), v1136)); + svfloat32_t v936 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v934), v1136)); + svfloat32_t v945 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v943), v1136)); + svfloat32_t v954 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v952), v1136)); + svfloat32_t v963 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v961), v1136)); + svfloat32_t v972 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v970), v1136)); + svfloat32_t v981 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v979), v1136)); + svfloat32_t v990 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v988), v1136)); + svfloat32_t v999 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v997), v1136)); + svfloat32_t v1019 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1017), v1136)); + svfloat32_t v1028 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1026), v1136)); + svfloat32_t v1037 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1035), v1136)); + svfloat32_t v1046 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1044), v1136)); + svfloat32_t v1055 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1053), v1136)); + svfloat32_t v1064 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1062), v1136)); + svfloat32_t v1073 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1071), v1136)); + svfloat32_t v1082 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1080), v1136)); + svfloat32_t v1091 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1089), v1136)); + svfloat32_t v1100 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1098), v1136)); + svfloat32_t v1109 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1107), v1136)); + svfloat32_t v1118 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1116), v1136)); + svfloat32_t v1127 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1125), v1136)); + svfloat32_t zero52 = svdup_n_f32(0); + svfloat32_t v52 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero52, v927, v51, 0), + v927, v51, 90); + svfloat32_t zero59 = svdup_n_f32(0); + svfloat32_t v59 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero59, v936, v58, 0), + v936, v58, 90); + svfloat32_t zero94 = svdup_n_f32(0); + svfloat32_t v94 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero94, v945, v93, 0), + v945, v93, 90); + svfloat32_t zero101 = svdup_n_f32(0); + svfloat32_t v101 = + svcmla_f32_x(pred_full, 
svcmla_f32_x(pred_full, zero101, v954, v100, 0), + v954, v100, 90); + svfloat32_t zero150 = svdup_n_f32(0); + svfloat32_t v150 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero150, v972, v149, 0), + v972, v149, 90); + svfloat32_t zero157 = svdup_n_f32(0); + svfloat32_t v157 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero157, v981, v156, 0), + v981, v156, 90); + svfloat32_t zero206 = svdup_n_f32(0); + svfloat32_t v206 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero206, v999, v205, 0), + v999, v205, 90); + svfloat32_t zero262 = svdup_n_f32(0); + svfloat32_t v262 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero262, v1028, v261, 0), v1028, + v261, 90); + svfloat32_t zero269 = svdup_n_f32(0); + svfloat32_t v269 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero269, v1037, v268, 0), v1037, + v268, 90); + svfloat32_t zero318 = svdup_n_f32(0); + svfloat32_t v318 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero318, v1055, v317, 0), v1055, + v317, 90); + svfloat32_t zero325 = svdup_n_f32(0); + svfloat32_t v325 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero325, v1064, v324, 0), v1064, + v324, 90); + svfloat32_t zero374 = svdup_n_f32(0); + svfloat32_t v374 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero374, v1082, v373, 0), v1082, + v373, 90); + svfloat32_t zero381 = svdup_n_f32(0); + svfloat32_t v381 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero381, v1091, v380, 0), v1091, + v380, 90); + svfloat32_t zero430 = svdup_n_f32(0); + svfloat32_t v430 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero430, v1109, v429, 0), v1109, + v429, 90); + svfloat32_t zero437 = svdup_n_f32(0); + svfloat32_t v437 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero437, v1118, v436, 0), v1118, + v436, 90); + svfloat32_t v452 = svadd_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v453 = svsub_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v462 = svadd_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v463 = svsub_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v465 = svadd_f32_x(svptrue_b32(), v150, v157); + svfloat32_t v466 = svsub_f32_x(svptrue_b32(), v150, v157); + svfloat32_t v468 = svadd_f32_x(svptrue_b32(), v206, v213); + svfloat32_t v469 = svsub_f32_x(svptrue_b32(), v206, v213); + svfloat32_t v471 = svadd_f32_x(svptrue_b32(), v262, v269); + svfloat32_t v472 = svsub_f32_x(svptrue_b32(), v262, v269); + svfloat32_t v474 = svadd_f32_x(svptrue_b32(), v318, v325); + svfloat32_t v475 = svsub_f32_x(svptrue_b32(), v318, v325); + svfloat32_t v477 = svadd_f32_x(svptrue_b32(), v374, v381); + svfloat32_t v478 = svsub_f32_x(svptrue_b32(), v374, v381); + svfloat32_t v480 = svadd_f32_x(svptrue_b32(), v430, v437); + svfloat32_t v481 = svsub_f32_x(svptrue_b32(), v430, v437); + svfloat32_t v461 = svadd_f32_x(svptrue_b32(), v452, v1137); + svfloat32_t v464 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, v462, v963, v114, 0), + v963, v114, 90); + svfloat32_t v467 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, v465, v990, v170, 0), + v990, v170, 90); + svfloat32_t v470 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, v468, v1019, v226, 0), + v1019, v226, 90); + svfloat32_t v473 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, v471, v1046, v282, 0), + v1046, v282, 90); + svfloat32_t v476 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, v474, v1073, v338, 0), + v1073, v338, 90); + svfloat32_t v479 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, v477, v1100, v394, 0), + v1100, v394, 90); + svfloat32_t v482 = + 
svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, v480, v1127, v450, 0), + v1127, v450, 90); + svfloat32_t v555 = svadd_f32_x(svptrue_b32(), v452, v471); + svfloat32_t v556 = svsub_f32_x(svptrue_b32(), v452, v471); + svfloat32_t v557 = svadd_f32_x(svptrue_b32(), v465, v477); + svfloat32_t v558 = svsub_f32_x(svptrue_b32(), v465, v477); + svfloat32_t v559 = svadd_f32_x(svptrue_b32(), v462, v474); + svfloat32_t v560 = svsub_f32_x(svptrue_b32(), v462, v474); + svfloat32_t v561 = svadd_f32_x(svptrue_b32(), v468, v480); + svfloat32_t v562 = svsub_f32_x(svptrue_b32(), v468, v480); + svfloat32_t v627 = svadd_f32_x(svptrue_b32(), v453, v472); + svfloat32_t v628 = svsub_f32_x(svptrue_b32(), v453, v472); + svfloat32_t v629 = svadd_f32_x(svptrue_b32(), v466, v478); + svfloat32_t v630 = svsub_f32_x(svptrue_b32(), v466, v478); + svfloat32_t v631 = svadd_f32_x(svptrue_b32(), v463, v475); + svfloat32_t v632 = svsub_f32_x(svptrue_b32(), v463, v475); + svfloat32_t v633 = svadd_f32_x(svptrue_b32(), v469, v481); + svfloat32_t v634 = svsub_f32_x(svptrue_b32(), v469, v481); + svfloat32_t v483 = svadd_f32_x(svptrue_b32(), v461, v473); + svfloat32_t v484 = svsub_f32_x(svptrue_b32(), v461, v473); + svfloat32_t v485 = svadd_f32_x(svptrue_b32(), v467, v479); + svfloat32_t v486 = svsub_f32_x(svptrue_b32(), v467, v479); + svfloat32_t v487 = svadd_f32_x(svptrue_b32(), v464, v476); + svfloat32_t v488 = svsub_f32_x(svptrue_b32(), v464, v476); + svfloat32_t v489 = svadd_f32_x(svptrue_b32(), v470, v482); + svfloat32_t v490 = svsub_f32_x(svptrue_b32(), v470, v482); + svfloat32_t v563 = svadd_f32_x(svptrue_b32(), v555, v557); + svfloat32_t v564 = svsub_f32_x(svptrue_b32(), v555, v557); + svfloat32_t v565 = svadd_f32_x(svptrue_b32(), v559, v561); + svfloat32_t v566 = svsub_f32_x(svptrue_b32(), v559, v561); + svfloat32_t v569 = svadd_f32_x(svptrue_b32(), v560, v562); + svfloat32_t v570 = svsub_f32_x(svptrue_b32(), v560, v562); + svfloat32_t zero604 = svdup_n_f32(0); + svfloat32_t v604 = svcmla_f32_x(pred_full, zero604, v1151, v558, 90); + svfloat32_t v635 = svadd_f32_x(svptrue_b32(), v627, v629); + svfloat32_t v636 = svsub_f32_x(svptrue_b32(), v627, v629); + svfloat32_t v637 = svadd_f32_x(svptrue_b32(), v631, v633); + svfloat32_t v638 = svsub_f32_x(svptrue_b32(), v631, v633); + svfloat32_t v641 = svadd_f32_x(svptrue_b32(), v632, v634); + svfloat32_t v642 = svsub_f32_x(svptrue_b32(), v632, v634); + svfloat32_t zero675 = svdup_n_f32(0); + svfloat32_t v675 = svcmla_f32_x(pred_full, zero675, v1158, v628, 90); + svfloat32_t v491 = svadd_f32_x(svptrue_b32(), v483, v485); + svfloat32_t v492 = svsub_f32_x(svptrue_b32(), v483, v485); + svfloat32_t v493 = svadd_f32_x(svptrue_b32(), v487, v489); + svfloat32_t v494 = svsub_f32_x(svptrue_b32(), v487, v489); + svfloat32_t v497 = svadd_f32_x(svptrue_b32(), v488, v490); + svfloat32_t v498 = svsub_f32_x(svptrue_b32(), v488, v490); + svfloat32_t zero532 = svdup_n_f32(0); + svfloat32_t v532 = svcmla_f32_x(pred_full, zero532, v1143, v486, 90); + svfloat32_t v567 = svadd_f32_x(svptrue_b32(), v563, v565); + svfloat32_t v568 = svsub_f32_x(svptrue_b32(), v563, v565); + svfloat32_t zero592 = svdup_n_f32(0); + svfloat32_t v592 = svcmla_f32_x(pred_full, zero592, v1151, v566, 90); + svfloat32_t zero611 = svdup_n_f32(0); + svfloat32_t v611 = svcmla_f32_x(pred_full, zero611, v1152, v569, 90); + svfloat32_t v616 = svmul_f32_x(svptrue_b32(), v570, v1153); + svfloat32_t v639 = svadd_f32_x(svptrue_b32(), v635, v637); + svfloat32_t v640 = svsub_f32_x(svptrue_b32(), v635, v637); + svfloat32_t zero663 = 
svdup_n_f32(0); + svfloat32_t v663 = svcmla_f32_x(pred_full, zero663, v1158, v636, 90); + svfloat32_t v685 = svmul_f32_x(svptrue_b32(), v641, v1160); + svfloat32_t zero692 = svdup_n_f32(0); + svfloat32_t v692 = svcmla_f32_x(pred_full, zero692, v1161, v642, 90); + svfloat32_t v495 = svadd_f32_x(svptrue_b32(), v491, v493); + svfloat32_t v496 = svsub_f32_x(svptrue_b32(), v491, v493); + svfloat32_t zero520 = svdup_n_f32(0); + svfloat32_t v520 = svcmla_f32_x(pred_full, zero520, v1143, v494, 90); + svfloat32_t zero539 = svdup_n_f32(0); + svfloat32_t v539 = svcmla_f32_x(pred_full, zero539, v1144, v497, 90); + svfloat32_t v617 = svmla_f32_x(pred_full, v592, v564, v1150); + svfloat32_t v618 = svnmls_f32_x(pred_full, v592, v564, v1150); + svfloat32_t v619 = svmla_f32_x(pred_full, v616, v556, v1150); + svfloat32_t v620 = svnmls_f32_x(pred_full, v616, v556, v1150); + svfloat32_t v621 = svadd_f32_x(svptrue_b32(), v604, v611); + svfloat32_t v622 = svsub_f32_x(svptrue_b32(), v604, v611); + svfloat32_t zero649 = svdup_n_f32(0); + svfloat32_t v649 = svcmla_f32_x(pred_full, zero649, v1158, v639, 90); + svfloat32_t zero656 = svdup_n_f32(0); + svfloat32_t v656 = svcmla_f32_x(pred_full, zero656, v1158, v640, 90); + svfloat32_t v693 = svmla_f32_x(pred_full, v663, v638, v1159); + svfloat32_t v694 = svmls_f32_x(pred_full, v663, v638, v1159); + svfloat32_t v695 = svadd_f32_x(svptrue_b32(), v675, v692); + svfloat32_t v696 = svsub_f32_x(svptrue_b32(), v675, v692); + svfloat32_t v697 = svmla_f32_x(pred_full, v685, v630, v1159); + svfloat32_t v698 = svnmls_f32_x(pred_full, v685, v630, v1159); + svfloat32_t v545 = svadd_f32_x(svptrue_b32(), v492, v520); + svfloat32_t v546 = svsub_f32_x(svptrue_b32(), v492, v520); + svfloat32_t v547 = svmla_f32_x(pred_full, v484, v498, v1145); + svfloat32_t v548 = svmls_f32_x(pred_full, v484, v498, v1145); + svfloat32_t v549 = svadd_f32_x(svptrue_b32(), v532, v539); + svfloat32_t v550 = svsub_f32_x(svptrue_b32(), v532, v539); + svfloat32_t v623 = svadd_f32_x(svptrue_b32(), v619, v621); + svfloat32_t v624 = svsub_f32_x(svptrue_b32(), v619, v621); + svfloat32_t v625 = svadd_f32_x(svptrue_b32(), v620, v622); + svfloat32_t v626 = svsub_f32_x(svptrue_b32(), v620, v622); + svfloat32_t v699 = svadd_f32_x(svptrue_b32(), v695, v697); + svfloat32_t v700 = svsub_f32_x(svptrue_b32(), v695, v697); + svfloat32_t v701 = svadd_f32_x(svptrue_b32(), v696, v698); + svfloat32_t v702 = svsub_f32_x(svptrue_b32(), v696, v698); + svfloat32_t v703 = svmla_f32_x(pred_full, v495, v567, v1150); + svint16_t v708 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v495, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v811 = svmla_f32_x(pred_full, v496, v568, v1150); + svint16_t v816 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v496, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v551 = svadd_f32_x(svptrue_b32(), v547, v549); + svfloat32_t v552 = svsub_f32_x(svptrue_b32(), v547, v549); + svfloat32_t v553 = svadd_f32_x(svptrue_b32(), v548, v550); + svfloat32_t v554 = svsub_f32_x(svptrue_b32(), v548, v550); + svfloat32_t v704 = svadd_f32_x(svptrue_b32(), v703, v649); + svfloat32_t v705 = svsub_f32_x(svptrue_b32(), v703, v649); + svfloat32_t v757 = svadd_f32_x(svptrue_b32(), v546, v618); + svint16_t v762 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, 
svmul_n_f32_x(pred_full, v546, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v812 = svadd_f32_x(svptrue_b32(), v811, v656); + svfloat32_t v813 = svsub_f32_x(svptrue_b32(), v811, v656); + svfloat32_t v865 = svadd_f32_x(svptrue_b32(), v545, v617); + svint16_t v870 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v545, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1169), v1377, + svreinterpret_u64_s16(v708)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1277), v1377, + svreinterpret_u64_s16(v816)); + svint16_t v716 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v705, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v724 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v704, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v730 = svadd_f32_x(svptrue_b32(), v552, v624); + svint16_t v735 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v552, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v758 = svadd_f32_x(svptrue_b32(), v757, v694); + svfloat32_t v759 = svsub_f32_x(svptrue_b32(), v757, v694); + svfloat32_t v784 = svadd_f32_x(svptrue_b32(), v553, v625); + svint16_t v789 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v553, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v824 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v813, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v832 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v812, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v838 = svadd_f32_x(svptrue_b32(), v554, v626); + svint16_t v843 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v554, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v866 = svadd_f32_x(svptrue_b32(), v865, v693); + svfloat32_t v867 = svsub_f32_x(svptrue_b32(), v865, v693); + svfloat32_t v892 = svadd_f32_x(svptrue_b32(), v551, v623); + svint16_t v897 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v551, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1223), v1377, + svreinterpret_u64_s16(v762)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1331), v1377, + svreinterpret_u64_s16(v870)); + svfloat32_t v731 = svadd_f32_x(svptrue_b32(), v730, v700); + svfloat32_t v732 = svsub_f32_x(svptrue_b32(), v730, v700); + svint16_t v770 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v759, (float)(1ULL << 31ULL)))), 
+ svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v778 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v758, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v785 = svadd_f32_x(svptrue_b32(), v784, v701); + svfloat32_t v786 = svsub_f32_x(svptrue_b32(), v784, v701); + svfloat32_t v839 = svadd_f32_x(svptrue_b32(), v838, v702); + svfloat32_t v840 = svsub_f32_x(svptrue_b32(), v838, v702); + svint16_t v878 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v867, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v886 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v866, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v893 = svadd_f32_x(svptrue_b32(), v892, v699); + svfloat32_t v894 = svsub_f32_x(svptrue_b32(), v892, v699); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1178), v1377, + svreinterpret_u64_s16(v716)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1187), v1377, + svreinterpret_u64_s16(v724)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1196), v1377, + svreinterpret_u64_s16(v735)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1250), v1377, + svreinterpret_u64_s16(v789)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1286), v1377, + svreinterpret_u64_s16(v824)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1295), v1377, + svreinterpret_u64_s16(v832)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1304), v1377, + svreinterpret_u64_s16(v843)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1358), v1377, + svreinterpret_u64_s16(v897)); + svint16_t v743 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v732, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v751 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v731, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v797 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v786, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v805 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v785, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v851 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v840, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v859 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v839, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v905 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v894, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 
0x0000000000040004ULL))); + svint16_t v913 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v893, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1232), v1377, + svreinterpret_u64_s16(v770)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1241), v1377, + svreinterpret_u64_s16(v778)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1340), v1377, + svreinterpret_u64_s16(v878)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1349), v1377, + svreinterpret_u64_s16(v886)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1205), v1377, + svreinterpret_u64_s16(v743)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1214), v1377, + svreinterpret_u64_s16(v751)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1259), v1377, + svreinterpret_u64_s16(v797)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1268), v1377, + svreinterpret_u64_s16(v805)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1313), v1377, + svreinterpret_u64_s16(v851)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1322), v1377, + svreinterpret_u64_s16(v859)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1367), v1377, + svreinterpret_u64_s16(v905)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1376), v1377, + svreinterpret_u64_s16(v913)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cs16_ab_t_gs25(const armral_cmplx_f32_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, + const armral_cmplx_f32_t *restrict w, + int howmany, int idist, int odist, + float dir) { + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + int32_t *v6 = (int32_t *)y; + const float32x2_t *v7 = (const float32x2_t *)w; + for (int j = 0; j < howmany; j += 1) { + float32x2_t v92 = v5[istride]; + float v1168 = 9.6858316112863108e-01F; + float v1171 = -2.4868988716485479e-01F; + float v1172 = 2.4868988716485479e-01F; + float v1312 = 8.7630668004386358e-01F; + float v1315 = -4.8175367410171532e-01F; + float v1316 = 4.8175367410171532e-01F; + float v1456 = 7.2896862742141155e-01F; + float v1459 = -6.8454710592868862e-01F; + float v1460 = 6.8454710592868862e-01F; + float v1468 = 6.2790519529313527e-02F; + float v1471 = -9.9802672842827156e-01F; + float v1472 = 9.9802672842827156e-01F; + float v1600 = 5.3582679497899655e-01F; + float v1603 = -8.4432792550201508e-01F; + float v1604 = 8.4432792550201508e-01F; + float v1612 = -4.2577929156507272e-01F; + float v1615 = -9.0482705246601947e-01F; + float v1616 = 9.0482705246601947e-01F; + float v1624 = -6.3742398974868952e-01F; + float v1627 = 7.7051324277578936e-01F; + float v1628 = -7.7051324277578936e-01F; + float v1642 = -9.9211470131447776e-01F; + float v1645 = -1.2533323356430454e-01F; + float v1646 = 1.2533323356430454e-01F; + float v1662 = 2.5000000000000000e-01F; + float v1672 = 5.5901699437494745e-01F; + float v1682 = 6.1803398874989490e-01F; + float v1707 = 9.5105651629515353e-01F; + float v1708 = -9.5105651629515353e-01F; + float32x2_t v1710 = (float32x2_t){v4, v4}; + float v1733 = 2.0000000000000000e+00F; + float32x2_t v98 = vtrn1_f32(v92, v92); + float32x2_t v99 = vtrn2_f32(v92, v92); + float32x2_t v452 = v5[0]; + float32x2_t v1169 = (float32x2_t){v1168, v1168}; + float32x2_t v1173 = (float32x2_t){v1171, v1172}; + float32x2_t v1313 = (float32x2_t){v1312, v1312}; + 
float32x2_t v1317 = (float32x2_t){v1315, v1316}; + float32x2_t v1457 = (float32x2_t){v1456, v1456}; + float32x2_t v1461 = (float32x2_t){v1459, v1460}; + float32x2_t v1469 = (float32x2_t){v1468, v1468}; + float32x2_t v1473 = (float32x2_t){v1471, v1472}; + float32x2_t v1503 = (float32x2_t){v1628, v1627}; + float32x2_t v1601 = (float32x2_t){v1600, v1600}; + float32x2_t v1605 = (float32x2_t){v1603, v1604}; + float32x2_t v1613 = (float32x2_t){v1612, v1612}; + float32x2_t v1617 = (float32x2_t){v1615, v1616}; + float32x2_t v1625 = (float32x2_t){v1624, v1624}; + float32x2_t v1629 = (float32x2_t){v1627, v1628}; + float32x2_t v1643 = (float32x2_t){v1642, v1642}; + float32x2_t v1647 = (float32x2_t){v1645, v1646}; + float32x2_t v1663 = (float32x2_t){v1662, v1662}; + float32x2_t v1673 = (float32x2_t){v1672, v1672}; + float32x2_t v1683 = (float32x2_t){v1682, v1682}; + float32x2_t v1709 = (float32x2_t){v1707, v1708}; + float32x2_t v1734 = (float32x2_t){v1733, v1733}; + float32x2_t v20 = v5[istride * 5]; + int64_t v24 = 8 + j * 48; + float32x2_t v38 = v5[istride * 10]; + int64_t v42 = 18 + j * 48; + float32x2_t v56 = v5[istride * 15]; + int64_t v60 = 28 + j * 48; + float32x2_t v74 = v5[istride * 20]; + int64_t v78 = 38 + j * 48; + float32x2_t v97 = v7[j * 48]; + int64_t v101 = j * 48 + 1; + float32x2_t v110 = v5[istride * 6]; + int64_t v114 = 10 + j * 48; + float32x2_t v128 = v5[istride * 11]; + int64_t v132 = 20 + j * 48; + float32x2_t v146 = v5[istride * 16]; + int64_t v150 = 30 + j * 48; + float32x2_t v164 = v5[istride * 21]; + int64_t v168 = 40 + j * 48; + float32x2_t v182 = v5[istride * 2]; + int64_t v186 = 2 + j * 48; + float32x2_t v200 = v5[istride * 7]; + int64_t v204 = 12 + j * 48; + float32x2_t v218 = v5[istride * 12]; + int64_t v222 = 22 + j * 48; + float32x2_t v236 = v5[istride * 17]; + int64_t v240 = 32 + j * 48; + float32x2_t v254 = v5[istride * 22]; + int64_t v258 = 42 + j * 48; + float32x2_t v272 = v5[istride * 3]; + int64_t v276 = 4 + j * 48; + float32x2_t v290 = v5[istride * 8]; + int64_t v294 = 14 + j * 48; + float32x2_t v308 = v5[istride * 13]; + int64_t v312 = 24 + j * 48; + float32x2_t v326 = v5[istride * 18]; + int64_t v330 = 34 + j * 48; + float32x2_t v344 = v5[istride * 23]; + int64_t v348 = 44 + j * 48; + float32x2_t v362 = v5[istride * 4]; + int64_t v366 = 6 + j * 48; + float32x2_t v380 = v5[istride * 9]; + int64_t v384 = 16 + j * 48; + float32x2_t v398 = v5[istride * 14]; + int64_t v402 = 26 + j * 48; + float32x2_t v416 = v5[istride * 19]; + int64_t v420 = 36 + j * 48; + float32x2_t v434 = v5[istride * 24]; + int64_t v438 = 46 + j * 48; + float32x2_t v1175 = vmul_f32(v1710, v1173); + float32x2_t v1319 = vmul_f32(v1710, v1317); + float32x2_t v1463 = vmul_f32(v1710, v1461); + float32x2_t v1475 = vmul_f32(v1710, v1473); + float32x2_t v1505 = vmul_f32(v1710, v1503); + float32x2_t v1607 = vmul_f32(v1710, v1605); + float32x2_t v1619 = vmul_f32(v1710, v1617); + float32x2_t v1631 = vmul_f32(v1710, v1629); + float32x2_t v1649 = vmul_f32(v1710, v1647); + float32x2_t v1711 = vmul_f32(v1710, v1709); + float32x2_t v25 = v7[v24]; + float32x2_t v26 = vtrn1_f32(v20, v20); + float32x2_t v27 = vtrn2_f32(v20, v20); + int64_t v29 = v24 + 1; + float32x2_t v43 = v7[v42]; + float32x2_t v44 = vtrn1_f32(v38, v38); + float32x2_t v45 = vtrn2_f32(v38, v38); + int64_t v47 = v42 + 1; + float32x2_t v61 = v7[v60]; + float32x2_t v62 = vtrn1_f32(v56, v56); + float32x2_t v63 = vtrn2_f32(v56, v56); + int64_t v65 = v60 + 1; + float32x2_t v79 = v7[v78]; + float32x2_t v80 = vtrn1_f32(v74, v74); + float32x2_t v81 = 
vtrn2_f32(v74, v74); + int64_t v83 = v78 + 1; + float32x2_t v102 = v7[v101]; + float32x2_t v103 = vmul_f32(v98, v97); + float32x2_t v115 = v7[v114]; + float32x2_t v116 = vtrn1_f32(v110, v110); + float32x2_t v117 = vtrn2_f32(v110, v110); + int64_t v119 = v114 + 1; + float32x2_t v133 = v7[v132]; + float32x2_t v134 = vtrn1_f32(v128, v128); + float32x2_t v135 = vtrn2_f32(v128, v128); + int64_t v137 = v132 + 1; + float32x2_t v151 = v7[v150]; + float32x2_t v152 = vtrn1_f32(v146, v146); + float32x2_t v153 = vtrn2_f32(v146, v146); + int64_t v155 = v150 + 1; + float32x2_t v169 = v7[v168]; + float32x2_t v170 = vtrn1_f32(v164, v164); + float32x2_t v171 = vtrn2_f32(v164, v164); + int64_t v173 = v168 + 1; + float32x2_t v187 = v7[v186]; + float32x2_t v188 = vtrn1_f32(v182, v182); + float32x2_t v189 = vtrn2_f32(v182, v182); + int64_t v191 = v186 + 1; + float32x2_t v205 = v7[v204]; + float32x2_t v206 = vtrn1_f32(v200, v200); + float32x2_t v207 = vtrn2_f32(v200, v200); + int64_t v209 = v204 + 1; + float32x2_t v223 = v7[v222]; + float32x2_t v224 = vtrn1_f32(v218, v218); + float32x2_t v225 = vtrn2_f32(v218, v218); + int64_t v227 = v222 + 1; + float32x2_t v241 = v7[v240]; + float32x2_t v242 = vtrn1_f32(v236, v236); + float32x2_t v243 = vtrn2_f32(v236, v236); + int64_t v245 = v240 + 1; + float32x2_t v259 = v7[v258]; + float32x2_t v260 = vtrn1_f32(v254, v254); + float32x2_t v261 = vtrn2_f32(v254, v254); + int64_t v263 = v258 + 1; + float32x2_t v277 = v7[v276]; + float32x2_t v278 = vtrn1_f32(v272, v272); + float32x2_t v279 = vtrn2_f32(v272, v272); + int64_t v281 = v276 + 1; + float32x2_t v295 = v7[v294]; + float32x2_t v296 = vtrn1_f32(v290, v290); + float32x2_t v297 = vtrn2_f32(v290, v290); + int64_t v299 = v294 + 1; + float32x2_t v313 = v7[v312]; + float32x2_t v314 = vtrn1_f32(v308, v308); + float32x2_t v315 = vtrn2_f32(v308, v308); + int64_t v317 = v312 + 1; + float32x2_t v331 = v7[v330]; + float32x2_t v332 = vtrn1_f32(v326, v326); + float32x2_t v333 = vtrn2_f32(v326, v326); + int64_t v335 = v330 + 1; + float32x2_t v349 = v7[v348]; + float32x2_t v350 = vtrn1_f32(v344, v344); + float32x2_t v351 = vtrn2_f32(v344, v344); + int64_t v353 = v348 + 1; + float32x2_t v367 = v7[v366]; + float32x2_t v368 = vtrn1_f32(v362, v362); + float32x2_t v369 = vtrn2_f32(v362, v362); + int64_t v371 = v366 + 1; + float32x2_t v385 = v7[v384]; + float32x2_t v386 = vtrn1_f32(v380, v380); + float32x2_t v387 = vtrn2_f32(v380, v380); + int64_t v389 = v384 + 1; + float32x2_t v403 = v7[v402]; + float32x2_t v404 = vtrn1_f32(v398, v398); + float32x2_t v405 = vtrn2_f32(v398, v398); + int64_t v407 = v402 + 1; + float32x2_t v421 = v7[v420]; + float32x2_t v422 = vtrn1_f32(v416, v416); + float32x2_t v423 = vtrn2_f32(v416, v416); + int64_t v425 = v420 + 1; + float32x2_t v439 = v7[v438]; + float32x2_t v440 = vtrn1_f32(v434, v434); + float32x2_t v441 = vtrn2_f32(v434, v434); + int64_t v443 = v438 + 1; + float32x2_t v30 = v7[v29]; + float32x2_t v31 = vmul_f32(v26, v25); + float32x2_t v48 = v7[v47]; + float32x2_t v49 = vmul_f32(v44, v43); + float32x2_t v66 = v7[v65]; + float32x2_t v67 = vmul_f32(v62, v61); + float32x2_t v84 = v7[v83]; + float32x2_t v85 = vmul_f32(v80, v79); + float32x2_t v120 = v7[v119]; + float32x2_t v121 = vmul_f32(v116, v115); + float32x2_t v138 = v7[v137]; + float32x2_t v139 = vmul_f32(v134, v133); + float32x2_t v156 = v7[v155]; + float32x2_t v157 = vmul_f32(v152, v151); + float32x2_t v174 = v7[v173]; + float32x2_t v175 = vmul_f32(v170, v169); + float32x2_t v192 = v7[v191]; + float32x2_t v193 = vmul_f32(v188, v187); + float32x2_t 
v210 = v7[v209]; + float32x2_t v211 = vmul_f32(v206, v205); + float32x2_t v228 = v7[v227]; + float32x2_t v229 = vmul_f32(v224, v223); + float32x2_t v246 = v7[v245]; + float32x2_t v247 = vmul_f32(v242, v241); + float32x2_t v264 = v7[v263]; + float32x2_t v265 = vmul_f32(v260, v259); + float32x2_t v282 = v7[v281]; + float32x2_t v283 = vmul_f32(v278, v277); + float32x2_t v300 = v7[v299]; + float32x2_t v301 = vmul_f32(v296, v295); + float32x2_t v318 = v7[v317]; + float32x2_t v319 = vmul_f32(v314, v313); + float32x2_t v336 = v7[v335]; + float32x2_t v337 = vmul_f32(v332, v331); + float32x2_t v354 = v7[v353]; + float32x2_t v355 = vmul_f32(v350, v349); + float32x2_t v372 = v7[v371]; + float32x2_t v373 = vmul_f32(v368, v367); + float32x2_t v390 = v7[v389]; + float32x2_t v391 = vmul_f32(v386, v385); + float32x2_t v408 = v7[v407]; + float32x2_t v409 = vmul_f32(v404, v403); + float32x2_t v426 = v7[v425]; + float32x2_t v427 = vmul_f32(v422, v421); + float32x2_t v444 = v7[v443]; + float32x2_t v445 = vmul_f32(v440, v439); + float32x2_t v105 = vfma_f32(v103, v99, v102); + float32x2_t v33 = vfma_f32(v31, v27, v30); + float32x2_t v51 = vfma_f32(v49, v45, v48); + float32x2_t v69 = vfma_f32(v67, v63, v66); + float32x2_t v87 = vfma_f32(v85, v81, v84); + float32x2_t v123 = vfma_f32(v121, v117, v120); + float32x2_t v141 = vfma_f32(v139, v135, v138); + float32x2_t v159 = vfma_f32(v157, v153, v156); + float32x2_t v177 = vfma_f32(v175, v171, v174); + float32x2_t v195 = vfma_f32(v193, v189, v192); + float32x2_t v213 = vfma_f32(v211, v207, v210); + float32x2_t v231 = vfma_f32(v229, v225, v228); + float32x2_t v249 = vfma_f32(v247, v243, v246); + float32x2_t v267 = vfma_f32(v265, v261, v264); + float32x2_t v285 = vfma_f32(v283, v279, v282); + float32x2_t v303 = vfma_f32(v301, v297, v300); + float32x2_t v321 = vfma_f32(v319, v315, v318); + float32x2_t v339 = vfma_f32(v337, v333, v336); + float32x2_t v357 = vfma_f32(v355, v351, v354); + float32x2_t v375 = vfma_f32(v373, v369, v372); + float32x2_t v393 = vfma_f32(v391, v387, v390); + float32x2_t v411 = vfma_f32(v409, v405, v408); + float32x2_t v429 = vfma_f32(v427, v423, v426); + float32x2_t v447 = vfma_f32(v445, v441, v444); + float32x2_t v489 = vsub_f32(v33, v87); + float32x2_t v493 = vmul_f32(v33, v1734); + float32x2_t v507 = vsub_f32(v51, v69); + float32x2_t v511 = vmul_f32(v51, v1734); + float32x2_t v603 = vsub_f32(v123, v177); + float32x2_t v607 = vmul_f32(v123, v1734); + float32x2_t v621 = vsub_f32(v141, v159); + float32x2_t v625 = vmul_f32(v141, v1734); + float32x2_t v717 = vsub_f32(v213, v267); + float32x2_t v721 = vmul_f32(v213, v1734); + float32x2_t v735 = vsub_f32(v231, v249); + float32x2_t v739 = vmul_f32(v231, v1734); + float32x2_t v831 = vsub_f32(v303, v357); + float32x2_t v835 = vmul_f32(v303, v1734); + float32x2_t v849 = vsub_f32(v321, v339); + float32x2_t v853 = vmul_f32(v321, v1734); + float32x2_t v945 = vsub_f32(v393, v447); + float32x2_t v949 = vmul_f32(v393, v1734); + float32x2_t v963 = vsub_f32(v411, v429); + float32x2_t v967 = vmul_f32(v411, v1734); + float32x2_t v494 = vsub_f32(v493, v489); + float32x2_t v512 = vsub_f32(v511, v507); + float32x2_t v523 = vmul_f32(v507, v1683); + float32x2_t v538 = vmul_f32(v489, v1683); + float32x2_t v608 = vsub_f32(v607, v603); + float32x2_t v626 = vsub_f32(v625, v621); + float32x2_t v637 = vmul_f32(v621, v1683); + float32x2_t v652 = vmul_f32(v603, v1683); + float32x2_t v722 = vsub_f32(v721, v717); + float32x2_t v740 = vsub_f32(v739, v735); + float32x2_t v751 = vmul_f32(v735, v1683); + float32x2_t v766 = 
vmul_f32(v717, v1683); + float32x2_t v836 = vsub_f32(v835, v831); + float32x2_t v854 = vsub_f32(v853, v849); + float32x2_t v865 = vmul_f32(v849, v1683); + float32x2_t v880 = vmul_f32(v831, v1683); + float32x2_t v950 = vsub_f32(v949, v945); + float32x2_t v968 = vsub_f32(v967, v963); + float32x2_t v979 = vmul_f32(v963, v1683); + float32x2_t v994 = vmul_f32(v945, v1683); + float32x2_t v513 = vadd_f32(v494, v512); + float32x2_t v514 = vsub_f32(v494, v512); + float32x2_t v524 = vadd_f32(v489, v523); + float32x2_t v539 = vsub_f32(v538, v507); + float32x2_t v627 = vadd_f32(v608, v626); + float32x2_t v628 = vsub_f32(v608, v626); + float32x2_t v638 = vadd_f32(v603, v637); + float32x2_t v653 = vsub_f32(v652, v621); + float32x2_t v741 = vadd_f32(v722, v740); + float32x2_t v742 = vsub_f32(v722, v740); + float32x2_t v752 = vadd_f32(v717, v751); + float32x2_t v767 = vsub_f32(v766, v735); + float32x2_t v855 = vadd_f32(v836, v854); + float32x2_t v856 = vsub_f32(v836, v854); + float32x2_t v866 = vadd_f32(v831, v865); + float32x2_t v881 = vsub_f32(v880, v849); + float32x2_t v969 = vadd_f32(v950, v968); + float32x2_t v970 = vsub_f32(v950, v968); + float32x2_t v980 = vadd_f32(v945, v979); + float32x2_t v995 = vsub_f32(v994, v963); + float32x2_t v518 = vmul_f32(v513, v1663); + float32x2_t v528 = vmul_f32(v514, v1673); + float32x2_t v540 = vadd_f32(v452, v513); + float32x2_t v546 = vrev64_f32(v524); + float32x2_t v554 = vrev64_f32(v539); + float32x2_t v632 = vmul_f32(v627, v1663); + float32x2_t v642 = vmul_f32(v628, v1673); + float32x2_t v654 = vadd_f32(v105, v627); + float32x2_t v660 = vrev64_f32(v638); + float32x2_t v668 = vrev64_f32(v653); + float32x2_t v746 = vmul_f32(v741, v1663); + float32x2_t v756 = vmul_f32(v742, v1673); + float32x2_t v768 = vadd_f32(v195, v741); + float32x2_t v774 = vrev64_f32(v752); + float32x2_t v782 = vrev64_f32(v767); + float32x2_t v860 = vmul_f32(v855, v1663); + float32x2_t v870 = vmul_f32(v856, v1673); + float32x2_t v882 = vadd_f32(v285, v855); + float32x2_t v888 = vrev64_f32(v866); + float32x2_t v896 = vrev64_f32(v881); + float32x2_t v974 = vmul_f32(v969, v1663); + float32x2_t v984 = vmul_f32(v970, v1673); + float32x2_t v996 = vadd_f32(v375, v969); + float32x2_t v1002 = vrev64_f32(v980); + float32x2_t v1010 = vrev64_f32(v995); + float32x2_t v519 = vsub_f32(v452, v518); + float32x2_t v547 = vmul_f32(v546, v1711); + float32x2_t v555 = vmul_f32(v554, v1711); + float32x2_t v633 = vsub_f32(v105, v632); + float32x2_t v661 = vmul_f32(v660, v1711); + float32x2_t v669 = vmul_f32(v668, v1711); + float32x2_t v747 = vsub_f32(v195, v746); + float32x2_t v775 = vmul_f32(v774, v1711); + float32x2_t v783 = vmul_f32(v782, v1711); + float32x2_t v861 = vsub_f32(v285, v860); + float32x2_t v889 = vmul_f32(v888, v1711); + float32x2_t v897 = vmul_f32(v896, v1711); + float32x2_t v975 = vsub_f32(v375, v974); + float32x2_t v1003 = vmul_f32(v1002, v1711); + float32x2_t v1011 = vmul_f32(v1010, v1711); + float32x2_t v1059 = vsub_f32(v654, v996); + float32x2_t v1063 = vmul_f32(v654, v1734); + float32x2_t v1077 = vsub_f32(v768, v882); + float32x2_t v1081 = vmul_f32(v768, v1734); + float32x2_t v529 = vsub_f32(v519, v528); + float32x2_t v533 = vmul_f32(v519, v1734); + float32x2_t v643 = vsub_f32(v633, v642); + float32x2_t v647 = vmul_f32(v633, v1734); + float32x2_t v757 = vsub_f32(v747, v756); + float32x2_t v761 = vmul_f32(v747, v1734); + float32x2_t v871 = vsub_f32(v861, v870); + float32x2_t v875 = vmul_f32(v861, v1734); + float32x2_t v985 = vsub_f32(v975, v984); + float32x2_t v989 = vmul_f32(v975, v1734); + 
float32x2_t v1064 = vsub_f32(v1063, v1059); + float32x2_t v1082 = vsub_f32(v1081, v1077); + float32x2_t v1093 = vmul_f32(v1077, v1683); + float32x2_t v1108 = vmul_f32(v1059, v1683); + float32x2_t v534 = vsub_f32(v533, v529); + float32x2_t v556 = vsub_f32(v529, v555); + float32x2_t v560 = vmul_f32(v529, v1734); + float32x2_t v648 = vsub_f32(v647, v643); + float32x2_t v670 = vsub_f32(v643, v669); + float32x2_t v674 = vmul_f32(v643, v1734); + float32x2_t v762 = vsub_f32(v761, v757); + float32x2_t v784 = vsub_f32(v757, v783); + float32x2_t v788 = vmul_f32(v757, v1734); + float32x2_t v876 = vsub_f32(v875, v871); + float32x2_t v898 = vsub_f32(v871, v897); + float32x2_t v902 = vmul_f32(v871, v1734); + float32x2_t v990 = vsub_f32(v989, v985); + float32x2_t v1012 = vsub_f32(v985, v1011); + float32x2_t v1016 = vmul_f32(v985, v1734); + float32x2_t v1083 = vadd_f32(v1064, v1082); + float32x2_t v1084 = vsub_f32(v1064, v1082); + float32x2_t v1094 = vadd_f32(v1059, v1093); + float32x2_t v1109 = vsub_f32(v1108, v1077); + float32x2_t v548 = vsub_f32(v534, v547); + float32x2_t v561 = vsub_f32(v560, v556); + float32x2_t v565 = vmul_f32(v534, v1734); + float32x2_t v662 = vsub_f32(v648, v661); + float32x2_t v675 = vsub_f32(v674, v670); + float32x2_t v679 = vmul_f32(v648, v1734); + float32x2_t v776 = vsub_f32(v762, v775); + float32x2_t v789 = vsub_f32(v788, v784); + float32x2_t v793 = vmul_f32(v762, v1734); + float32x2_t v890 = vsub_f32(v876, v889); + float32x2_t v903 = vsub_f32(v902, v898); + float32x2_t v907 = vmul_f32(v876, v1734); + float32x2_t v1004 = vsub_f32(v990, v1003); + float32x2_t v1017 = vsub_f32(v1016, v1012); + float32x2_t v1021 = vmul_f32(v990, v1734); + float32x2_t v1088 = vmul_f32(v1083, v1663); + float32x2_t v1098 = vmul_f32(v1084, v1673); + float32x2_t v1110 = vadd_f32(v540, v1083); + float32x2_t v1122 = vrev64_f32(v1094); + float32x2_t v1136 = vrev64_f32(v1109); + float32x2_t v1320 = vrev64_f32(v670); + float32x2_t v1332 = vrev64_f32(v784); + float32x2_t v1344 = vrev64_f32(v1012); + float32x2_t v1362 = vrev64_f32(v898); + float32x2_t v566 = vsub_f32(v565, v548); + float32x2_t v680 = vsub_f32(v679, v662); + float32x2_t v794 = vsub_f32(v793, v776); + float32x2_t v908 = vsub_f32(v907, v890); + float32x2_t v1022 = vsub_f32(v1021, v1004); + float32x2_t v1089 = vsub_f32(v540, v1088); + int16x4_t v1113 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1110, 15), (int32x2_t){0, 0})); + float32x2_t v1123 = vmul_f32(v1122, v1711); + float32x2_t v1137 = vmul_f32(v1136, v1711); + float32x2_t v1176 = vrev64_f32(v662); + float32x2_t v1188 = vrev64_f32(v776); + float32x2_t v1200 = vrev64_f32(v1004); + float32x2_t v1218 = vrev64_f32(v890); + float32x2_t v1321 = vmul_f32(v1320, v1319); + float32x2_t v1333 = vmul_f32(v1332, v1607); + float32x2_t v1345 = vmul_f32(v1344, v1619); + float32x2_t v1363 = vmul_f32(v1362, v1475); + float32x2_t v1464 = vrev64_f32(v675); + float32x2_t v1476 = vrev64_f32(v789); + float32x2_t v1488 = vrev64_f32(v1017); + float32x2_t v1506 = vrev64_f32(v903); + float32x2_t v1099 = vsub_f32(v1089, v1098); + float32x2_t v1103 = vmul_f32(v1089, v1734); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v1113), 0); + float32x2_t v1177 = vmul_f32(v1176, v1175); + float32x2_t v1189 = vmul_f32(v1188, v1319); + float32x2_t v1201 = vmul_f32(v1200, v1607); + float32x2_t v1219 = vmul_f32(v1218, v1463); + float32x2_t v1322 = vfma_f32(v1321, v670, v1313); + float32x2_t v1334 = vfma_f32(v1333, v784, v1601); + float32x2_t v1346 = vfma_f32(v1345, v1012, v1613); + float32x2_t v1364 = vfma_f32(v1363, v898, v1469); + 
float32x2_t v1465 = vmul_f32(v1464, v1463); + float32x2_t v1477 = vmul_f32(v1476, v1475); + float32x2_t v1489 = vmul_f32(v1488, v1649); + float32x2_t v1507 = vmul_f32(v1506, v1505); + float32x2_t v1608 = vrev64_f32(v680); + float32x2_t v1620 = vrev64_f32(v794); + float32x2_t v1632 = vrev64_f32(v1022); + float32x2_t v1650 = vrev64_f32(v908); + float32x2_t v1104 = vsub_f32(v1103, v1099); + float32x2_t v1138 = vsub_f32(v1099, v1137); + float32x2_t v1148 = vmul_f32(v1099, v1734); + float32x2_t v1178 = vfma_f32(v1177, v662, v1169); + float32x2_t v1190 = vfma_f32(v1189, v776, v1313); + float32x2_t v1202 = vfma_f32(v1201, v1004, v1601); + float32x2_t v1220 = vfma_f32(v1219, v890, v1457); + float32x2_t v1347 = vsub_f32(v1322, v1346); + float32x2_t v1351 = vmul_f32(v1322, v1734); + float32x2_t v1365 = vsub_f32(v1334, v1364); + float32x2_t v1369 = vmul_f32(v1334, v1734); + float32x2_t v1466 = vfma_f32(v1465, v675, v1457); + float32x2_t v1478 = vfma_f32(v1477, v789, v1469); + float32x2_t v1490 = vfma_f32(v1489, v1017, v1643); + float32x2_t v1508 = vfma_f32(v1507, v903, v1625); + float32x2_t v1609 = vmul_f32(v1608, v1607); + float32x2_t v1621 = vmul_f32(v1620, v1619); + float32x2_t v1633 = vmul_f32(v1632, v1631); + float32x2_t v1651 = vmul_f32(v1650, v1649); + float32x2_t v1124 = vsub_f32(v1104, v1123); + int16x4_t v1141 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1138, 15), (int32x2_t){0, 0})); + float32x2_t v1149 = vsub_f32(v1148, v1138); + float32x2_t v1159 = vmul_f32(v1104, v1734); + float32x2_t v1203 = vsub_f32(v1178, v1202); + float32x2_t v1207 = vmul_f32(v1178, v1734); + float32x2_t v1221 = vsub_f32(v1190, v1220); + float32x2_t v1225 = vmul_f32(v1190, v1734); + float32x2_t v1352 = vsub_f32(v1351, v1347); + float32x2_t v1370 = vsub_f32(v1369, v1365); + float32x2_t v1381 = vmul_f32(v1365, v1683); + float32x2_t v1396 = vmul_f32(v1347, v1683); + float32x2_t v1491 = vsub_f32(v1466, v1490); + float32x2_t v1495 = vmul_f32(v1466, v1734); + float32x2_t v1509 = vsub_f32(v1478, v1508); + float32x2_t v1513 = vmul_f32(v1478, v1734); + float32x2_t v1610 = vfma_f32(v1609, v680, v1601); + float32x2_t v1622 = vfma_f32(v1621, v794, v1613); + float32x2_t v1634 = vfma_f32(v1633, v1022, v1625); + float32x2_t v1652 = vfma_f32(v1651, v908, v1643); + int16x4_t v1127 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1124, 15), (int32x2_t){0, 0})); + v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v1141), 0); + int16x4_t v1152 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1149, 15), (int32x2_t){0, 0})); + float32x2_t v1160 = vsub_f32(v1159, v1124); + float32x2_t v1208 = vsub_f32(v1207, v1203); + float32x2_t v1226 = vsub_f32(v1225, v1221); + float32x2_t v1237 = vmul_f32(v1221, v1683); + float32x2_t v1252 = vmul_f32(v1203, v1683); + float32x2_t v1371 = vadd_f32(v1352, v1370); + float32x2_t v1372 = vsub_f32(v1352, v1370); + float32x2_t v1382 = vadd_f32(v1347, v1381); + float32x2_t v1397 = vsub_f32(v1396, v1365); + float32x2_t v1496 = vsub_f32(v1495, v1491); + float32x2_t v1514 = vsub_f32(v1513, v1509); + float32x2_t v1525 = vmul_f32(v1509, v1683); + float32x2_t v1540 = vmul_f32(v1491, v1683); + float32x2_t v1635 = vsub_f32(v1610, v1634); + float32x2_t v1639 = vmul_f32(v1610, v1734); + float32x2_t v1653 = vsub_f32(v1622, v1652); + float32x2_t v1657 = vmul_f32(v1622, v1734); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v1127), 0); + v6[ostride * 15] = vget_lane_s32(vreinterpret_s32_s16(v1152), 0); + int16x4_t v1163 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1160, 15), (int32x2_t){0, 0})); + float32x2_t v1227 = 
vadd_f32(v1208, v1226); + float32x2_t v1228 = vsub_f32(v1208, v1226); + float32x2_t v1238 = vadd_f32(v1203, v1237); + float32x2_t v1253 = vsub_f32(v1252, v1221); + float32x2_t v1376 = vmul_f32(v1371, v1663); + float32x2_t v1386 = vmul_f32(v1372, v1673); + float32x2_t v1398 = vadd_f32(v556, v1371); + float32x2_t v1410 = vrev64_f32(v1382); + float32x2_t v1424 = vrev64_f32(v1397); + float32x2_t v1515 = vadd_f32(v1496, v1514); + float32x2_t v1516 = vsub_f32(v1496, v1514); + float32x2_t v1526 = vadd_f32(v1491, v1525); + float32x2_t v1541 = vsub_f32(v1540, v1509); + float32x2_t v1640 = vsub_f32(v1639, v1635); + float32x2_t v1658 = vsub_f32(v1657, v1653); + float32x2_t v1669 = vmul_f32(v1653, v1683); + float32x2_t v1684 = vmul_f32(v1635, v1683); + v6[ostride * 20] = vget_lane_s32(vreinterpret_s32_s16(v1163), 0); + float32x2_t v1232 = vmul_f32(v1227, v1663); + float32x2_t v1242 = vmul_f32(v1228, v1673); + float32x2_t v1254 = vadd_f32(v548, v1227); + float32x2_t v1266 = vrev64_f32(v1238); + float32x2_t v1280 = vrev64_f32(v1253); + float32x2_t v1377 = vsub_f32(v556, v1376); + int16x4_t v1401 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1398, 15), (int32x2_t){0, 0})); + float32x2_t v1411 = vmul_f32(v1410, v1711); + float32x2_t v1425 = vmul_f32(v1424, v1711); + float32x2_t v1520 = vmul_f32(v1515, v1663); + float32x2_t v1530 = vmul_f32(v1516, v1673); + float32x2_t v1542 = vadd_f32(v561, v1515); + float32x2_t v1554 = vrev64_f32(v1526); + float32x2_t v1568 = vrev64_f32(v1541); + float32x2_t v1659 = vadd_f32(v1640, v1658); + float32x2_t v1660 = vsub_f32(v1640, v1658); + float32x2_t v1670 = vadd_f32(v1635, v1669); + float32x2_t v1685 = vsub_f32(v1684, v1653); + float32x2_t v1233 = vsub_f32(v548, v1232); + int16x4_t v1257 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1254, 15), (int32x2_t){0, 0})); + float32x2_t v1267 = vmul_f32(v1266, v1711); + float32x2_t v1281 = vmul_f32(v1280, v1711); + float32x2_t v1387 = vsub_f32(v1377, v1386); + float32x2_t v1391 = vmul_f32(v1377, v1734); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v1401), 0); + float32x2_t v1521 = vsub_f32(v561, v1520); + int16x4_t v1545 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1542, 15), (int32x2_t){0, 0})); + float32x2_t v1555 = vmul_f32(v1554, v1711); + float32x2_t v1569 = vmul_f32(v1568, v1711); + float32x2_t v1664 = vmul_f32(v1659, v1663); + float32x2_t v1674 = vmul_f32(v1660, v1673); + float32x2_t v1686 = vadd_f32(v566, v1659); + float32x2_t v1698 = vrev64_f32(v1670); + float32x2_t v1712 = vrev64_f32(v1685); + float32x2_t v1243 = vsub_f32(v1233, v1242); + float32x2_t v1247 = vmul_f32(v1233, v1734); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v1257), 0); + float32x2_t v1392 = vsub_f32(v1391, v1387); + float32x2_t v1426 = vsub_f32(v1387, v1425); + float32x2_t v1436 = vmul_f32(v1387, v1734); + float32x2_t v1531 = vsub_f32(v1521, v1530); + float32x2_t v1535 = vmul_f32(v1521, v1734); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v1545), 0); + float32x2_t v1665 = vsub_f32(v566, v1664); + int16x4_t v1689 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1686, 15), (int32x2_t){0, 0})); + float32x2_t v1699 = vmul_f32(v1698, v1711); + float32x2_t v1713 = vmul_f32(v1712, v1711); + float32x2_t v1248 = vsub_f32(v1247, v1243); + float32x2_t v1282 = vsub_f32(v1243, v1281); + float32x2_t v1292 = vmul_f32(v1243, v1734); + float32x2_t v1412 = vsub_f32(v1392, v1411); + int16x4_t v1429 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1426, 15), (int32x2_t){0, 0})); + float32x2_t v1437 = vsub_f32(v1436, v1426); + float32x2_t v1447 = vmul_f32(v1392, 
v1734); + float32x2_t v1536 = vsub_f32(v1535, v1531); + float32x2_t v1570 = vsub_f32(v1531, v1569); + float32x2_t v1580 = vmul_f32(v1531, v1734); + float32x2_t v1675 = vsub_f32(v1665, v1674); + float32x2_t v1679 = vmul_f32(v1665, v1734); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v1689), 0); + float32x2_t v1268 = vsub_f32(v1248, v1267); + int16x4_t v1285 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1282, 15), (int32x2_t){0, 0})); + float32x2_t v1293 = vsub_f32(v1292, v1282); + float32x2_t v1303 = vmul_f32(v1248, v1734); + int16x4_t v1415 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1412, 15), (int32x2_t){0, 0})); + v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v1429), 0); + int16x4_t v1440 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1437, 15), (int32x2_t){0, 0})); + float32x2_t v1448 = vsub_f32(v1447, v1412); + float32x2_t v1556 = vsub_f32(v1536, v1555); + int16x4_t v1573 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1570, 15), (int32x2_t){0, 0})); + float32x2_t v1581 = vsub_f32(v1580, v1570); + float32x2_t v1591 = vmul_f32(v1536, v1734); + float32x2_t v1680 = vsub_f32(v1679, v1675); + float32x2_t v1714 = vsub_f32(v1675, v1713); + float32x2_t v1724 = vmul_f32(v1675, v1734); + int16x4_t v1271 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1268, 15), (int32x2_t){0, 0})); + v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v1285), 0); + int16x4_t v1296 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1293, 15), (int32x2_t){0, 0})); + float32x2_t v1304 = vsub_f32(v1303, v1268); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v1415), 0); + v6[ostride * 17] = vget_lane_s32(vreinterpret_s32_s16(v1440), 0); + int16x4_t v1451 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1448, 15), (int32x2_t){0, 0})); + int16x4_t v1559 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1556, 15), (int32x2_t){0, 0})); + v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v1573), 0); + int16x4_t v1584 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1581, 15), (int32x2_t){0, 0})); + float32x2_t v1592 = vsub_f32(v1591, v1556); + float32x2_t v1700 = vsub_f32(v1680, v1699); + int16x4_t v1717 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1714, 15), (int32x2_t){0, 0})); + float32x2_t v1725 = vsub_f32(v1724, v1714); + float32x2_t v1735 = vmul_f32(v1680, v1734); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v1271), 0); + v6[ostride * 16] = vget_lane_s32(vreinterpret_s32_s16(v1296), 0); + int16x4_t v1307 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1304, 15), (int32x2_t){0, 0})); + v6[ostride * 22] = vget_lane_s32(vreinterpret_s32_s16(v1451), 0); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v1559), 0); + v6[ostride * 18] = vget_lane_s32(vreinterpret_s32_s16(v1584), 0); + int16x4_t v1595 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1592, 15), (int32x2_t){0, 0})); + int16x4_t v1703 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1700, 15), (int32x2_t){0, 0})); + v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v1717), 0); + int16x4_t v1728 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1725, 15), (int32x2_t){0, 0})); + float32x2_t v1736 = vsub_f32(v1735, v1700); + v6[ostride * 21] = vget_lane_s32(vreinterpret_s32_s16(v1307), 0); + v6[ostride * 23] = vget_lane_s32(vreinterpret_s32_s16(v1595), 0); + v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v1703), 0); + v6[ostride * 19] = vget_lane_s32(vreinterpret_s32_s16(v1728), 0); + int16x4_t v1739 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1736, 15), (int32x2_t){0, 0})); + v6[ostride * 24] = vget_lane_s32(vreinterpret_s32_s16(v1739), 0); + v5 += 1 * idist; 
+    v6 += 1 * odist;
+  }
+}
+#endif
+
+#ifdef ARMRAL_ARCH_SVE
+void armral_fft_cf32_cf32_cs16_ab_t_gs25(const armral_cmplx_f32_t *restrict x,
+                                         armral_cmplx_int16_t *restrict y,
+                                         int istride, int ostride,
+                                         const armral_cmplx_f32_t *restrict w,
+                                         int howmany, int idist, int odist,
+                                         float dir) {
+  int64_t v0 = istride;
+  int64_t v1 = idist;
+  int64_t v2 = ostride;
+  int64_t v3 = odist;
+  float v4 = dir;
+  const float32x2_t *v5 = (const float32x2_t *)x;
+  int32_t *v6 = (int32_t *)y;
+  const float32x2_t *v7 = (const float32x2_t *)w;
+  int64_t v8 = howmany;
+  int64_t v10 = svcntd();
+  int64_t v11 = v10 * v1;
+  int64_t v12 = v10 * v3;
+  for (int j = 0; j < v8; j += v10) {
+    svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2);
+    int64_t v13 = j;
+    float v1164 = 9.6858316112863108e-01F;
+    float v1169 = 2.4868988716485479e-01F;
+    float v1331 = 8.7630668004386358e-01F;
+    float v1336 = 4.8175367410171532e-01F;
+    float v1498 = 7.2896862742141155e-01F;
+    float v1503 = 6.8454710592868862e-01F;
+    float v1511 = 6.2790519529313527e-02F;
+    float v1516 = 9.9802672842827156e-01F;
+    float v1549 = 7.7051324277578925e-01F;
+    float v1665 = 5.3582679497899655e-01F;
+    float v1670 = 8.4432792550201508e-01F;
+    float v1678 = -4.2577929156507272e-01F;
+    float v1683 = 9.0482705246601947e-01F;
+    float v1691 = -6.3742398974868952e-01F;
+    float v1696 = -7.7051324277578936e-01F;
+    float v1711 = -9.9211470131447776e-01F;
+    float v1716 = 1.2533323356430454e-01F;
+    float v1733 = 2.5000000000000000e-01F;
+    float v1745 = 5.5901699437494745e-01F;
+    float v1757 = 6.1803398874989490e-01F;
+    float v1788 = -9.5105651629515353e-01F;
+    float v1818 = 2.0000000000000000e+00F;
+    const float32x2_t *v1873 = &v5[v0];
+    int32_t *v2239 = &v6[v2];
+    int64_t v19 = v0 * 5;
+    int64_t v27 = v10 * 4;
+    int64_t v33 = v0 * 10;
+    int64_t v41 = v10 * 9;
+    int64_t v47 = v0 * 15;
+    int64_t v55 = v10 * 14;
+    int64_t v61 = v0 * 20;
+    int64_t v69 = v10 * 19;
+    int64_t v89 = v0 * 6;
+    int64_t v97 = v10 * 5;
+    int64_t v103 = v0 * 11;
+    int64_t v111 = v10 * 10;
+    int64_t v117 = v0 * 16;
+    int64_t v125 = v10 * 15;
+    int64_t v131 = v0 * 21;
+    int64_t v139 = v10 * 20;
+    int64_t v145 = v0 * 2;
+    int64_t v159 = v0 * 7;
+    int64_t v167 = v10 * 6;
+    int64_t v173 = v0 * 12;
+    int64_t v181 = v10 * 11;
+    int64_t v187 = v0 * 17;
+    int64_t v195 = v10 * 16;
+    int64_t v201 = v0 * 22;
+    int64_t v209 = v10 * 21;
+    int64_t v215 = v0 * 3;
+    int64_t v223 = v10 * 2;
+    int64_t v229 = v0 * 8;
+    int64_t v237 = v10 * 7;
+    int64_t v243 = v0 * 13;
+    int64_t v251 = v10 * 12;
+    int64_t v257 = v0 * 18;
+    int64_t v265 = v10 * 17;
+    int64_t v271 = v0 * 23;
+    int64_t v279 = v10 * 22;
+    int64_t v285 = v0 * 4;
+    int64_t v293 = v10 * 3;
+    int64_t v299 = v0 * 9;
+    int64_t v307 = v10 * 8;
+    int64_t v313 = v0 * 14;
+    int64_t v321 = v10 * 13;
+    int64_t v327 = v0 * 19;
+    int64_t v335 = v10 * 18;
+    int64_t v341 = v0 * 24;
+    int64_t v349 = v10 * 23;
+    int64_t v350 = v13 * 24;
+    int64_t v1112 = v2 * 5;
+    int64_t v1128 = v2 * 10;
+    int64_t v1142 = v2 * 15;
+    int64_t v1156 = v2 * 20;
+    float v1172 = v4 * v1169;
+    int64_t v1279 = v2 * 6;
+    int64_t v1295 = v2 * 11;
+    int64_t v1309 = v2 * 16;
+    int64_t v1323 = v2 * 21;
+    float v1339 = v4 * v1336;
+    int64_t v1430 = v2 * 2;
+    int64_t v1446 = v2 * 7;
+    int64_t v1462 = v2 * 12;
+    int64_t v1476 = v2 * 17;
+    int64_t v1490 = v2 * 22;
+    float v1506 = v4 * v1503;
+    float v1519 = v4 * v1516;
+    float v1552 = v4 * v1549;
+    int64_t v1597 = v2 * 3;
+    int64_t v1613 = v2 * 8;
+    int64_t v1629 = v2 * 13;
+    int64_t v1643 = v2 * 18;
+    int64_t v1657 = v2 * 23;
+    float v1673 =
v4 * v1670; + float v1686 = v4 * v1683; + float v1699 = v4 * v1696; + float v1719 = v4 * v1716; + int64_t v1764 = v2 * 4; + int64_t v1780 = v2 * 9; + float v1791 = v4 * v1788; + int64_t v1796 = v2 * 14; + int64_t v1810 = v2 * 19; + int64_t v1824 = v2 * 24; + const float32x2_t *v2055 = &v5[0]; + svint64_t v2056 = svindex_s64(0, v1); + svfloat32_t v2161 = svdup_n_f32(0); + int32_t *v2175 = &v6[0]; + svfloat32_t v2218 = svdup_n_f32(v1164); + svfloat32_t v2282 = svdup_n_f32(v1331); + svfloat32_t v2346 = svdup_n_f32(v1498); + svfloat32_t v2348 = svdup_n_f32(v1511); + svfloat32_t v2410 = svdup_n_f32(v1665); + svfloat32_t v2412 = svdup_n_f32(v1678); + svfloat32_t v2414 = svdup_n_f32(v1691); + svfloat32_t v2417 = svdup_n_f32(v1711); + svfloat32_t v2420 = svdup_n_f32(v1733); + svfloat32_t v2422 = svdup_n_f32(v1745); + svfloat32_t v2424 = svdup_n_f32(v1757); + svfloat32_t v2464 = svdup_n_f32(v1818); + svint64_t v2472 = svindex_s64(0, v3); + int64_t v29 = v27 + v350; + int64_t v43 = v41 + v350; + int64_t v57 = v55 + v350; + int64_t v71 = v69 + v350; + svfloat32_t v86 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v350])); + int64_t v99 = v97 + v350; + int64_t v113 = v111 + v350; + int64_t v127 = v125 + v350; + int64_t v141 = v139 + v350; + int64_t v155 = v10 + v350; + int64_t v169 = v167 + v350; + int64_t v183 = v181 + v350; + int64_t v197 = v195 + v350; + int64_t v211 = v209 + v350; + int64_t v225 = v223 + v350; + int64_t v239 = v237 + v350; + int64_t v253 = v251 + v350; + int64_t v267 = v265 + v350; + int64_t v281 = v279 + v350; + int64_t v295 = v293 + v350; + int64_t v309 = v307 + v350; + int64_t v323 = v321 + v350; + int64_t v337 = v335 + v350; + int64_t v351 = v349 + v350; + const float32x2_t *v1837 = &v5[v19]; + const float32x2_t *v1846 = &v5[v33]; + const float32x2_t *v1855 = &v5[v47]; + const float32x2_t *v1864 = &v5[v61]; + svfloat32_t v1875 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1873), v2056)); + const float32x2_t *v1883 = &v5[v89]; + const float32x2_t *v1892 = &v5[v103]; + const float32x2_t *v1901 = &v5[v117]; + const float32x2_t *v1910 = &v5[v131]; + const float32x2_t *v1919 = &v5[v145]; + const float32x2_t *v1928 = &v5[v159]; + const float32x2_t *v1937 = &v5[v173]; + const float32x2_t *v1946 = &v5[v187]; + const float32x2_t *v1955 = &v5[v201]; + const float32x2_t *v1964 = &v5[v215]; + const float32x2_t *v1973 = &v5[v229]; + const float32x2_t *v1982 = &v5[v243]; + const float32x2_t *v1991 = &v5[v257]; + const float32x2_t *v2000 = &v5[v271]; + const float32x2_t *v2009 = &v5[v285]; + const float32x2_t *v2018 = &v5[v299]; + const float32x2_t *v2027 = &v5[v313]; + const float32x2_t *v2036 = &v5[v327]; + const float32x2_t *v2045 = &v5[v341]; + svfloat32_t v2057 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v2055), v2056)); + int32_t *v2185 = &v6[v1112]; + int32_t *v2195 = &v6[v1128]; + int32_t *v2205 = &v6[v1142]; + int32_t *v2215 = &v6[v1156]; + svfloat32_t v2219 = svdup_n_f32(v1172); + int32_t *v2249 = &v6[v1279]; + int32_t *v2259 = &v6[v1295]; + int32_t *v2269 = &v6[v1309]; + int32_t *v2279 = &v6[v1323]; + svfloat32_t v2283 = svdup_n_f32(v1339); + int32_t *v2303 = &v6[v1430]; + int32_t *v2313 = &v6[v1446]; + int32_t *v2323 = &v6[v1462]; + int32_t *v2333 = &v6[v1476]; + int32_t *v2343 = &v6[v1490]; + svfloat32_t v2347 = svdup_n_f32(v1506); + svfloat32_t v2349 = svdup_n_f32(v1519); + svfloat32_t v2354 = svdup_n_f32(v1552); + int32_t *v2367 = &v6[v1597]; + int32_t *v2377 = &v6[v1613]; + int32_t 
*v2387 = &v6[v1629]; + int32_t *v2397 = &v6[v1643]; + int32_t *v2407 = &v6[v1657]; + svfloat32_t v2411 = svdup_n_f32(v1673); + svfloat32_t v2413 = svdup_n_f32(v1686); + svfloat32_t v2415 = svdup_n_f32(v1699); + svfloat32_t v2418 = svdup_n_f32(v1719); + int32_t *v2431 = &v6[v1764]; + int32_t *v2441 = &v6[v1780]; + svfloat32_t v2444 = svdup_n_f32(v1791); + int32_t *v2451 = &v6[v1796]; + int32_t *v2461 = &v6[v1810]; + int32_t *v2471 = &v6[v1824]; + svfloat32_t v30 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v29])); + svfloat32_t v44 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v43])); + svfloat32_t v58 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v57])); + svfloat32_t v72 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v71])); + svfloat32_t zero87 = svdup_n_f32(0); + svfloat32_t v87 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero87, v1875, v86, 0), + v1875, v86, 90); + svfloat32_t v100 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v99])); + svfloat32_t v114 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v113])); + svfloat32_t v128 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v127])); + svfloat32_t v142 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v141])); + svfloat32_t v156 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v155])); + svfloat32_t v170 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v169])); + svfloat32_t v184 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v183])); + svfloat32_t v198 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v197])); + svfloat32_t v212 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v211])); + svfloat32_t v226 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v225])); + svfloat32_t v240 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v239])); + svfloat32_t v254 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v253])); + svfloat32_t v268 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v267])); + svfloat32_t v282 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v281])); + svfloat32_t v296 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v295])); + svfloat32_t v310 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v309])); + svfloat32_t v324 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v323])); + svfloat32_t v338 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v337])); + svfloat32_t v352 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v351])); + svfloat32_t v1839 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1837), v2056)); + svfloat32_t v1848 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1846), v2056)); + svfloat32_t v1857 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1855), v2056)); + svfloat32_t v1866 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1864), v2056)); + svfloat32_t v1885 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1883), v2056)); + svfloat32_t v1894 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1892), v2056)); + 
svfloat32_t v1903 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1901), v2056)); + svfloat32_t v1912 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1910), v2056)); + svfloat32_t v1921 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1919), v2056)); + svfloat32_t v1930 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1928), v2056)); + svfloat32_t v1939 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1937), v2056)); + svfloat32_t v1948 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1946), v2056)); + svfloat32_t v1957 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1955), v2056)); + svfloat32_t v1966 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1964), v2056)); + svfloat32_t v1975 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1973), v2056)); + svfloat32_t v1984 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1982), v2056)); + svfloat32_t v1993 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1991), v2056)); + svfloat32_t v2002 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v2000), v2056)); + svfloat32_t v2011 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v2009), v2056)); + svfloat32_t v2020 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v2018), v2056)); + svfloat32_t v2029 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v2027), v2056)); + svfloat32_t v2038 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v2036), v2056)); + svfloat32_t v2047 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v2045), v2056)); + svfloat32_t zero31 = svdup_n_f32(0); + svfloat32_t v31 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero31, v1839, v30, 0), + v1839, v30, 90); + svfloat32_t zero45 = svdup_n_f32(0); + svfloat32_t v45 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero45, v1848, v44, 0), + v1848, v44, 90); + svfloat32_t zero59 = svdup_n_f32(0); + svfloat32_t v59 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero59, v1857, v58, 0), + v1857, v58, 90); + svfloat32_t zero73 = svdup_n_f32(0); + svfloat32_t v73 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero73, v1866, v72, 0), + v1866, v72, 90); + svfloat32_t zero101 = svdup_n_f32(0); + svfloat32_t v101 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero101, v1885, v100, 0), v1885, + v100, 90); + svfloat32_t zero115 = svdup_n_f32(0); + svfloat32_t v115 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero115, v1894, v114, 0), v1894, + v114, 90); + svfloat32_t zero129 = svdup_n_f32(0); + svfloat32_t v129 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero129, v1903, v128, 0), v1903, + v128, 90); + svfloat32_t zero143 = svdup_n_f32(0); + svfloat32_t v143 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero143, v1912, v142, 0), v1912, + v142, 90); + svfloat32_t zero157 = svdup_n_f32(0); + svfloat32_t v157 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero157, v1921, v156, 0), v1921, + v156, 90); + svfloat32_t zero171 = svdup_n_f32(0); + svfloat32_t v171 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero171, v1930, v170, 0), v1930, + v170, 
90); + svfloat32_t zero185 = svdup_n_f32(0); + svfloat32_t v185 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero185, v1939, v184, 0), v1939, + v184, 90); + svfloat32_t zero199 = svdup_n_f32(0); + svfloat32_t v199 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero199, v1948, v198, 0), v1948, + v198, 90); + svfloat32_t zero213 = svdup_n_f32(0); + svfloat32_t v213 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero213, v1957, v212, 0), v1957, + v212, 90); + svfloat32_t zero227 = svdup_n_f32(0); + svfloat32_t v227 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero227, v1966, v226, 0), v1966, + v226, 90); + svfloat32_t zero241 = svdup_n_f32(0); + svfloat32_t v241 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero241, v1975, v240, 0), v1975, + v240, 90); + svfloat32_t zero255 = svdup_n_f32(0); + svfloat32_t v255 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero255, v1984, v254, 0), v1984, + v254, 90); + svfloat32_t zero269 = svdup_n_f32(0); + svfloat32_t v269 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero269, v1993, v268, 0), v1993, + v268, 90); + svfloat32_t zero283 = svdup_n_f32(0); + svfloat32_t v283 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero283, v2002, v282, 0), v2002, + v282, 90); + svfloat32_t zero297 = svdup_n_f32(0); + svfloat32_t v297 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero297, v2011, v296, 0), v2011, + v296, 90); + svfloat32_t zero311 = svdup_n_f32(0); + svfloat32_t v311 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero311, v2020, v310, 0), v2020, + v310, 90); + svfloat32_t zero325 = svdup_n_f32(0); + svfloat32_t v325 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero325, v2029, v324, 0), v2029, + v324, 90); + svfloat32_t zero339 = svdup_n_f32(0); + svfloat32_t v339 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero339, v2038, v338, 0), v2038, + v338, 90); + svfloat32_t zero353 = svdup_n_f32(0); + svfloat32_t v353 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero353, v2047, v352, 0), v2047, + v352, 90); + svfloat32_t v373 = svcmla_f32_x(pred_full, v31, v2161, v31, 90); + svfloat32_t v386 = svcmla_f32_x(pred_full, v45, v2161, v45, 90); + svfloat32_t v399 = svcmla_f32_x(pred_full, v73, v2161, v73, 90); + svfloat32_t v419 = svcmla_f32_x(pred_full, v59, v2161, v59, 90); + svfloat32_t v500 = svcmla_f32_x(pred_full, v101, v2161, v101, 90); + svfloat32_t v513 = svcmla_f32_x(pred_full, v115, v2161, v115, 90); + svfloat32_t v526 = svcmla_f32_x(pred_full, v143, v2161, v143, 90); + svfloat32_t v546 = svcmla_f32_x(pred_full, v129, v2161, v129, 90); + svfloat32_t v627 = svcmla_f32_x(pred_full, v171, v2161, v171, 90); + svfloat32_t v640 = svcmla_f32_x(pred_full, v185, v2161, v185, 90); + svfloat32_t v653 = svcmla_f32_x(pred_full, v213, v2161, v213, 90); + svfloat32_t v673 = svcmla_f32_x(pred_full, v199, v2161, v199, 90); + svfloat32_t v754 = svcmla_f32_x(pred_full, v241, v2161, v241, 90); + svfloat32_t v767 = svcmla_f32_x(pred_full, v255, v2161, v255, 90); + svfloat32_t v780 = svcmla_f32_x(pred_full, v283, v2161, v283, 90); + svfloat32_t v800 = svcmla_f32_x(pred_full, v269, v2161, v269, 90); + svfloat32_t v881 = svcmla_f32_x(pred_full, v311, v2161, v311, 90); + svfloat32_t v894 = svcmla_f32_x(pred_full, v325, v2161, v325, 90); + svfloat32_t v907 = svcmla_f32_x(pred_full, v353, v2161, v353, 90); + svfloat32_t v927 = svcmla_f32_x(pred_full, v339, v2161, v339, 90); + svfloat32_t v400 = svsub_f32_x(svptrue_b32(), v373, v399); + svfloat32_t v420 = svsub_f32_x(svptrue_b32(), v386, v419); + 
svfloat32_t v527 = svsub_f32_x(svptrue_b32(), v500, v526); + svfloat32_t v547 = svsub_f32_x(svptrue_b32(), v513, v546); + svfloat32_t v654 = svsub_f32_x(svptrue_b32(), v627, v653); + svfloat32_t v674 = svsub_f32_x(svptrue_b32(), v640, v673); + svfloat32_t v781 = svsub_f32_x(svptrue_b32(), v754, v780); + svfloat32_t v801 = svsub_f32_x(svptrue_b32(), v767, v800); + svfloat32_t v908 = svsub_f32_x(svptrue_b32(), v881, v907); + svfloat32_t v928 = svsub_f32_x(svptrue_b32(), v894, v927); + svfloat32_t v406 = svnmls_f32_x(pred_full, v400, v373, v2464); + svfloat32_t v426 = svnmls_f32_x(pred_full, v420, v386, v2464); + svfloat32_t v533 = svnmls_f32_x(pred_full, v527, v500, v2464); + svfloat32_t v553 = svnmls_f32_x(pred_full, v547, v513, v2464); + svfloat32_t v660 = svnmls_f32_x(pred_full, v654, v627, v2464); + svfloat32_t v680 = svnmls_f32_x(pred_full, v674, v640, v2464); + svfloat32_t v787 = svnmls_f32_x(pred_full, v781, v754, v2464); + svfloat32_t v807 = svnmls_f32_x(pred_full, v801, v767, v2464); + svfloat32_t v914 = svnmls_f32_x(pred_full, v908, v881, v2464); + svfloat32_t v934 = svnmls_f32_x(pred_full, v928, v894, v2464); + svfloat32_t v427 = svadd_f32_x(svptrue_b32(), v406, v426); + svfloat32_t v428 = svsub_f32_x(svptrue_b32(), v406, v426); + svfloat32_t v440 = svmla_f32_x(pred_full, v400, v420, v2424); + svfloat32_t v458 = svnmls_f32_x(pred_full, v420, v400, v2424); + svfloat32_t v554 = svadd_f32_x(svptrue_b32(), v533, v553); + svfloat32_t v555 = svsub_f32_x(svptrue_b32(), v533, v553); + svfloat32_t v567 = svmla_f32_x(pred_full, v527, v547, v2424); + svfloat32_t v585 = svnmls_f32_x(pred_full, v547, v527, v2424); + svfloat32_t v681 = svadd_f32_x(svptrue_b32(), v660, v680); + svfloat32_t v682 = svsub_f32_x(svptrue_b32(), v660, v680); + svfloat32_t v694 = svmla_f32_x(pred_full, v654, v674, v2424); + svfloat32_t v712 = svnmls_f32_x(pred_full, v674, v654, v2424); + svfloat32_t v808 = svadd_f32_x(svptrue_b32(), v787, v807); + svfloat32_t v809 = svsub_f32_x(svptrue_b32(), v787, v807); + svfloat32_t v821 = svmla_f32_x(pred_full, v781, v801, v2424); + svfloat32_t v839 = svnmls_f32_x(pred_full, v801, v781, v2424); + svfloat32_t v935 = svadd_f32_x(svptrue_b32(), v914, v934); + svfloat32_t v936 = svsub_f32_x(svptrue_b32(), v914, v934); + svfloat32_t v948 = svmla_f32_x(pred_full, v908, v928, v2424); + svfloat32_t v966 = svnmls_f32_x(pred_full, v928, v908, v2424); + svfloat32_t v459 = svadd_f32_x(svptrue_b32(), v2057, v427); + svfloat32_t zero466 = svdup_n_f32(0); + svfloat32_t v466 = svcmla_f32_x(pred_full, zero466, v2444, v440, 90); + svfloat32_t zero474 = svdup_n_f32(0); + svfloat32_t v474 = svcmla_f32_x(pred_full, zero474, v2444, v458, 90); + svfloat32_t v586 = svadd_f32_x(svptrue_b32(), v87, v554); + svfloat32_t zero593 = svdup_n_f32(0); + svfloat32_t v593 = svcmla_f32_x(pred_full, zero593, v2444, v567, 90); + svfloat32_t zero601 = svdup_n_f32(0); + svfloat32_t v601 = svcmla_f32_x(pred_full, zero601, v2444, v585, 90); + svfloat32_t v713 = svadd_f32_x(svptrue_b32(), v157, v681); + svfloat32_t zero720 = svdup_n_f32(0); + svfloat32_t v720 = svcmla_f32_x(pred_full, zero720, v2444, v694, 90); + svfloat32_t zero728 = svdup_n_f32(0); + svfloat32_t v728 = svcmla_f32_x(pred_full, zero728, v2444, v712, 90); + svfloat32_t v840 = svadd_f32_x(svptrue_b32(), v227, v808); + svfloat32_t zero847 = svdup_n_f32(0); + svfloat32_t v847 = svcmla_f32_x(pred_full, zero847, v2444, v821, 90); + svfloat32_t zero855 = svdup_n_f32(0); + svfloat32_t v855 = svcmla_f32_x(pred_full, zero855, v2444, v839, 90); + svfloat32_t v967 = 
svadd_f32_x(svptrue_b32(), v297, v935); + svfloat32_t zero974 = svdup_n_f32(0); + svfloat32_t v974 = svcmla_f32_x(pred_full, zero974, v2444, v948, 90); + svfloat32_t zero982 = svdup_n_f32(0); + svfloat32_t v982 = svcmla_f32_x(pred_full, zero982, v2444, v966, 90); + svfloat32_t v434 = svmls_f32_x(pred_full, v2057, v427, v2420); + svfloat32_t v561 = svmls_f32_x(pred_full, v87, v554, v2420); + svfloat32_t v688 = svmls_f32_x(pred_full, v157, v681, v2420); + svfloat32_t v815 = svmls_f32_x(pred_full, v227, v808, v2420); + svfloat32_t v942 = svmls_f32_x(pred_full, v297, v935, v2420); + svfloat32_t v446 = svmls_f32_x(pred_full, v434, v428, v2422); + svfloat32_t v573 = svmls_f32_x(pred_full, v561, v555, v2422); + svfloat32_t v700 = svmls_f32_x(pred_full, v688, v682, v2422); + svfloat32_t v827 = svmls_f32_x(pred_full, v815, v809, v2422); + svfloat32_t v954 = svmls_f32_x(pred_full, v942, v936, v2422); + svfloat32_t v1008 = svcmla_f32_x(pred_full, v586, v2161, v586, 90); + svfloat32_t v1021 = svcmla_f32_x(pred_full, v713, v2161, v713, 90); + svfloat32_t v1034 = svcmla_f32_x(pred_full, v967, v2161, v967, 90); + svfloat32_t v1054 = svcmla_f32_x(pred_full, v840, v2161, v840, 90); + svfloat32_t v452 = svnmls_f32_x(pred_full, v446, v434, v2464); + svfloat32_t v475 = svsub_f32_x(svptrue_b32(), v446, v474); + svfloat32_t v579 = svnmls_f32_x(pred_full, v573, v561, v2464); + svfloat32_t v602 = svsub_f32_x(svptrue_b32(), v573, v601); + svfloat32_t v706 = svnmls_f32_x(pred_full, v700, v688, v2464); + svfloat32_t v729 = svsub_f32_x(svptrue_b32(), v700, v728); + svfloat32_t v833 = svnmls_f32_x(pred_full, v827, v815, v2464); + svfloat32_t v856 = svsub_f32_x(svptrue_b32(), v827, v855); + svfloat32_t v960 = svnmls_f32_x(pred_full, v954, v942, v2464); + svfloat32_t v983 = svsub_f32_x(svptrue_b32(), v954, v982); + svfloat32_t v1035 = svsub_f32_x(svptrue_b32(), v1008, v1034); + svfloat32_t v1055 = svsub_f32_x(svptrue_b32(), v1021, v1054); + svfloat32_t v467 = svsub_f32_x(svptrue_b32(), v452, v466); + svfloat32_t v481 = svnmls_f32_x(pred_full, v475, v446, v2464); + svfloat32_t v594 = svsub_f32_x(svptrue_b32(), v579, v593); + svfloat32_t v608 = svnmls_f32_x(pred_full, v602, v573, v2464); + svfloat32_t v721 = svsub_f32_x(svptrue_b32(), v706, v720); + svfloat32_t v735 = svnmls_f32_x(pred_full, v729, v700, v2464); + svfloat32_t v848 = svsub_f32_x(svptrue_b32(), v833, v847); + svfloat32_t v862 = svnmls_f32_x(pred_full, v856, v827, v2464); + svfloat32_t v975 = svsub_f32_x(svptrue_b32(), v960, v974); + svfloat32_t v989 = svnmls_f32_x(pred_full, v983, v954, v2464); + svfloat32_t v1041 = svnmls_f32_x(pred_full, v1035, v1008, v2464); + svfloat32_t v1061 = svnmls_f32_x(pred_full, v1055, v1021, v2464); + svfloat32_t v1334 = svmul_f32_x(svptrue_b32(), v602, v2282); + svfloat32_t v1347 = svmul_f32_x(svptrue_b32(), v729, v2410); + svfloat32_t v1360 = svmul_f32_x(svptrue_b32(), v983, v2412); + svfloat32_t v1380 = svmul_f32_x(svptrue_b32(), v856, v2348); + svfloat32_t v487 = svnmls_f32_x(pred_full, v467, v452, v2464); + svfloat32_t v614 = svnmls_f32_x(pred_full, v594, v579, v2464); + svfloat32_t v741 = svnmls_f32_x(pred_full, v721, v706, v2464); + svfloat32_t v868 = svnmls_f32_x(pred_full, v848, v833, v2464); + svfloat32_t v995 = svnmls_f32_x(pred_full, v975, v960, v2464); + svfloat32_t v1062 = svadd_f32_x(svptrue_b32(), v1041, v1061); + svfloat32_t v1063 = svsub_f32_x(svptrue_b32(), v1041, v1061); + svfloat32_t v1075 = svmla_f32_x(pred_full, v1035, v1055, v2424); + svfloat32_t v1093 = svnmls_f32_x(pred_full, v1055, v1035, v2424); + 
svfloat32_t v1167 = svmul_f32_x(svptrue_b32(), v594, v2218); + svfloat32_t v1180 = svmul_f32_x(svptrue_b32(), v721, v2282); + svfloat32_t v1193 = svmul_f32_x(svptrue_b32(), v975, v2410); + svfloat32_t v1213 = svmul_f32_x(svptrue_b32(), v848, v2346); + svfloat32_t v1342 = svcmla_f32_x(pred_full, v1334, v2283, v602, 90); + svfloat32_t v1355 = svcmla_f32_x(pred_full, v1347, v2411, v729, 90); + svfloat32_t v1368 = svcmla_f32_x(pred_full, v1360, v2413, v983, 90); + svfloat32_t v1388 = svcmla_f32_x(pred_full, v1380, v2349, v856, 90); + svfloat32_t v1501 = svmul_f32_x(svptrue_b32(), v608, v2346); + svfloat32_t v1514 = svmul_f32_x(svptrue_b32(), v735, v2348); + svfloat32_t v1527 = svmul_f32_x(svptrue_b32(), v989, v2417); + svfloat32_t v1547 = svmul_f32_x(svptrue_b32(), v862, v2414); + svfloat32_t v1094 = svadd_f32_x(svptrue_b32(), v459, v1062); + svfloat32_t zero1109 = svdup_n_f32(0); + svfloat32_t v1109 = svcmla_f32_x(pred_full, zero1109, v2444, v1075, 90); + svfloat32_t zero1125 = svdup_n_f32(0); + svfloat32_t v1125 = svcmla_f32_x(pred_full, zero1125, v2444, v1093, 90); + svfloat32_t v1175 = svcmla_f32_x(pred_full, v1167, v2219, v594, 90); + svfloat32_t v1188 = svcmla_f32_x(pred_full, v1180, v2283, v721, 90); + svfloat32_t v1201 = svcmla_f32_x(pred_full, v1193, v2411, v975, 90); + svfloat32_t v1221 = svcmla_f32_x(pred_full, v1213, v2347, v848, 90); + svfloat32_t v1369 = svsub_f32_x(svptrue_b32(), v1342, v1368); + svfloat32_t v1389 = svsub_f32_x(svptrue_b32(), v1355, v1388); + svfloat32_t v1509 = svcmla_f32_x(pred_full, v1501, v2347, v608, 90); + svfloat32_t v1522 = svcmla_f32_x(pred_full, v1514, v2349, v735, 90); + svfloat32_t v1535 = svcmla_f32_x(pred_full, v1527, v2418, v989, 90); + svfloat32_t v1555 = svcmla_f32_x(pred_full, v1547, v2354, v862, 90); + svfloat32_t v1668 = svmul_f32_x(svptrue_b32(), v614, v2410); + svfloat32_t v1681 = svmul_f32_x(svptrue_b32(), v741, v2412); + svfloat32_t v1694 = svmul_f32_x(svptrue_b32(), v995, v2414); + svfloat32_t v1714 = svmul_f32_x(svptrue_b32(), v868, v2417); + svfloat32_t v1069 = svmls_f32_x(pred_full, v459, v1062, v2420); + svint16_t v1097 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1094, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svfloat32_t v1202 = svsub_f32_x(svptrue_b32(), v1175, v1201); + svfloat32_t v1222 = svsub_f32_x(svptrue_b32(), v1188, v1221); + svfloat32_t v1375 = svnmls_f32_x(pred_full, v1369, v1342, v2464); + svfloat32_t v1395 = svnmls_f32_x(pred_full, v1389, v1355, v2464); + svfloat32_t v1536 = svsub_f32_x(svptrue_b32(), v1509, v1535); + svfloat32_t v1556 = svsub_f32_x(svptrue_b32(), v1522, v1555); + svfloat32_t v1676 = svcmla_f32_x(pred_full, v1668, v2411, v614, 90); + svfloat32_t v1689 = svcmla_f32_x(pred_full, v1681, v2413, v741, 90); + svfloat32_t v1702 = svcmla_f32_x(pred_full, v1694, v2415, v995, 90); + svfloat32_t v1722 = svcmla_f32_x(pred_full, v1714, v2418, v868, 90); + svfloat32_t v1081 = svmls_f32_x(pred_full, v1069, v1063, v2422); + svfloat32_t v1208 = svnmls_f32_x(pred_full, v1202, v1175, v2464); + svfloat32_t v1228 = svnmls_f32_x(pred_full, v1222, v1188, v2464); + svfloat32_t v1396 = svadd_f32_x(svptrue_b32(), v1375, v1395); + svfloat32_t v1397 = svsub_f32_x(svptrue_b32(), v1375, v1395); + svfloat32_t v1409 = svmla_f32_x(pred_full, v1369, v1389, v2424); + svfloat32_t v1427 = svnmls_f32_x(pred_full, v1389, v1369, v2424); + svfloat32_t v1542 = svnmls_f32_x(pred_full, v1536, v1509, v2464); + svfloat32_t 
v1562 = svnmls_f32_x(pred_full, v1556, v1522, v2464); + svfloat32_t v1703 = svsub_f32_x(svptrue_b32(), v1676, v1702); + svfloat32_t v1723 = svsub_f32_x(svptrue_b32(), v1689, v1722); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v2175), v2472, + svreinterpret_u64_s16(v1097)); + svfloat32_t v1087 = svnmls_f32_x(pred_full, v1081, v1069, v2464); + svfloat32_t v1126 = svsub_f32_x(svptrue_b32(), v1081, v1125); + svfloat32_t v1229 = svadd_f32_x(svptrue_b32(), v1208, v1228); + svfloat32_t v1230 = svsub_f32_x(svptrue_b32(), v1208, v1228); + svfloat32_t v1242 = svmla_f32_x(pred_full, v1202, v1222, v2424); + svfloat32_t v1260 = svnmls_f32_x(pred_full, v1222, v1202, v2424); + svfloat32_t v1428 = svadd_f32_x(svptrue_b32(), v475, v1396); + svfloat32_t zero1443 = svdup_n_f32(0); + svfloat32_t v1443 = svcmla_f32_x(pred_full, zero1443, v2444, v1409, 90); + svfloat32_t zero1459 = svdup_n_f32(0); + svfloat32_t v1459 = svcmla_f32_x(pred_full, zero1459, v2444, v1427, 90); + svfloat32_t v1563 = svadd_f32_x(svptrue_b32(), v1542, v1562); + svfloat32_t v1564 = svsub_f32_x(svptrue_b32(), v1542, v1562); + svfloat32_t v1576 = svmla_f32_x(pred_full, v1536, v1556, v2424); + svfloat32_t v1594 = svnmls_f32_x(pred_full, v1556, v1536, v2424); + svfloat32_t v1709 = svnmls_f32_x(pred_full, v1703, v1676, v2464); + svfloat32_t v1729 = svnmls_f32_x(pred_full, v1723, v1689, v2464); + svfloat32_t v1110 = svsub_f32_x(svptrue_b32(), v1087, v1109); + svint16_t v1129 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1126, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svfloat32_t v1140 = svnmls_f32_x(pred_full, v1126, v1081, v2464); + svfloat32_t v1261 = svadd_f32_x(svptrue_b32(), v467, v1229); + svfloat32_t zero1276 = svdup_n_f32(0); + svfloat32_t v1276 = svcmla_f32_x(pred_full, zero1276, v2444, v1242, 90); + svfloat32_t zero1292 = svdup_n_f32(0); + svfloat32_t v1292 = svcmla_f32_x(pred_full, zero1292, v2444, v1260, 90); + svfloat32_t v1403 = svmls_f32_x(pred_full, v475, v1396, v2420); + svint16_t v1431 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1428, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svfloat32_t v1595 = svadd_f32_x(svptrue_b32(), v481, v1563); + svfloat32_t zero1610 = svdup_n_f32(0); + svfloat32_t v1610 = svcmla_f32_x(pred_full, zero1610, v2444, v1576, 90); + svfloat32_t zero1626 = svdup_n_f32(0); + svfloat32_t v1626 = svcmla_f32_x(pred_full, zero1626, v2444, v1594, 90); + svfloat32_t v1730 = svadd_f32_x(svptrue_b32(), v1709, v1729); + svfloat32_t v1731 = svsub_f32_x(svptrue_b32(), v1709, v1729); + svfloat32_t v1743 = svmla_f32_x(pred_full, v1703, v1723, v2424); + svfloat32_t v1761 = svnmls_f32_x(pred_full, v1723, v1703, v2424); + svint16_t v1113 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1110, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svint16_t v1143 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1140, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svfloat32_t v1154 = svnmls_f32_x(pred_full, v1110, v1087, v2464); + svfloat32_t v1236 = svmls_f32_x(pred_full, v467, v1229, v2420); + svint16_t v1264 = + 
svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1261, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svfloat32_t v1415 = svmls_f32_x(pred_full, v1403, v1397, v2422); + svfloat32_t v1570 = svmls_f32_x(pred_full, v481, v1563, v2420); + svint16_t v1598 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1595, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svfloat32_t v1762 = svadd_f32_x(svptrue_b32(), v487, v1730); + svfloat32_t zero1777 = svdup_n_f32(0); + svfloat32_t v1777 = svcmla_f32_x(pred_full, zero1777, v2444, v1743, 90); + svfloat32_t zero1793 = svdup_n_f32(0); + svfloat32_t v1793 = svcmla_f32_x(pred_full, zero1793, v2444, v1761, 90); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v2195), v2472, + svreinterpret_u64_s16(v1129)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v2303), v2472, + svreinterpret_u64_s16(v1431)); + svint16_t v1157 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1154, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svfloat32_t v1248 = svmls_f32_x(pred_full, v1236, v1230, v2422); + svfloat32_t v1421 = svnmls_f32_x(pred_full, v1415, v1403, v2464); + svfloat32_t v1460 = svsub_f32_x(svptrue_b32(), v1415, v1459); + svfloat32_t v1582 = svmls_f32_x(pred_full, v1570, v1564, v2422); + svfloat32_t v1737 = svmls_f32_x(pred_full, v487, v1730, v2420); + svint16_t v1765 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1762, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v2185), v2472, + svreinterpret_u64_s16(v1113)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v2205), v2472, + svreinterpret_u64_s16(v1143)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v2239), v2472, + svreinterpret_u64_s16(v1264)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v2367), v2472, + svreinterpret_u64_s16(v1598)); + svfloat32_t v1254 = svnmls_f32_x(pred_full, v1248, v1236, v2464); + svfloat32_t v1293 = svsub_f32_x(svptrue_b32(), v1248, v1292); + svfloat32_t v1444 = svsub_f32_x(svptrue_b32(), v1421, v1443); + svint16_t v1463 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1460, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svfloat32_t v1474 = svnmls_f32_x(pred_full, v1460, v1415, v2464); + svfloat32_t v1588 = svnmls_f32_x(pred_full, v1582, v1570, v2464); + svfloat32_t v1627 = svsub_f32_x(svptrue_b32(), v1582, v1626); + svfloat32_t v1749 = svmls_f32_x(pred_full, v1737, v1731, v2422); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v2215), v2472, + svreinterpret_u64_s16(v1157)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v2431), v2472, + svreinterpret_u64_s16(v1765)); + svfloat32_t v1277 = svsub_f32_x(svptrue_b32(), v1254, v1276); + svint16_t v1296 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1293, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svfloat32_t v1307 = svnmls_f32_x(pred_full, v1293, v1248, v2464); + svint16_t 
v1447 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1444, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svint16_t v1477 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1474, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svfloat32_t v1488 = svnmls_f32_x(pred_full, v1444, v1421, v2464); + svfloat32_t v1611 = svsub_f32_x(svptrue_b32(), v1588, v1610); + svint16_t v1630 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1627, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svfloat32_t v1641 = svnmls_f32_x(pred_full, v1627, v1582, v2464); + svfloat32_t v1755 = svnmls_f32_x(pred_full, v1749, v1737, v2464); + svfloat32_t v1794 = svsub_f32_x(svptrue_b32(), v1749, v1793); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v2323), v2472, + svreinterpret_u64_s16(v1463)); + svint16_t v1280 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1277, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svint16_t v1310 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1307, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svfloat32_t v1321 = svnmls_f32_x(pred_full, v1277, v1254, v2464); + svint16_t v1491 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1488, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svint16_t v1614 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1611, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svint16_t v1644 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1641, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svfloat32_t v1655 = svnmls_f32_x(pred_full, v1611, v1588, v2464); + svfloat32_t v1778 = svsub_f32_x(svptrue_b32(), v1755, v1777); + svint16_t v1797 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1794, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svfloat32_t v1808 = svnmls_f32_x(pred_full, v1794, v1749, v2464); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v2259), v2472, + svreinterpret_u64_s16(v1296)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v2313), v2472, + svreinterpret_u64_s16(v1447)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v2333), v2472, + svreinterpret_u64_s16(v1477)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v2387), v2472, + svreinterpret_u64_s16(v1630)); + svint16_t v1324 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1321, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svint16_t v1658 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1655, (float)(1ULL << 
31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svint16_t v1781 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1778, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svint16_t v1811 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1808, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svfloat32_t v1822 = svnmls_f32_x(pred_full, v1778, v1755, v2464); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v2249), v2472, + svreinterpret_u64_s16(v1280)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v2269), v2472, + svreinterpret_u64_s16(v1310)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v2343), v2472, + svreinterpret_u64_s16(v1491)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v2377), v2472, + svreinterpret_u64_s16(v1614)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v2397), v2472, + svreinterpret_u64_s16(v1644)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v2451), v2472, + svreinterpret_u64_s16(v1797)); + svint16_t v1825 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1822, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v2279), v2472, + svreinterpret_u64_s16(v1324)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v2407), v2472, + svreinterpret_u64_s16(v1658)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v2441), v2472, + svreinterpret_u64_s16(v1781)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v2461), v2472, + svreinterpret_u64_s16(v1811)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v2471), v2472, + svreinterpret_u64_s16(v1825)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cs16_ab_t_gs32(const armral_cmplx_f32_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, + const armral_cmplx_f32_t *restrict w, + int howmany, int idist, int odist, + float dir) { + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + int32_t *v6 = (int32_t *)y; + const float32x2_t *v7 = (const float32x2_t *)w; + for (int j = 0; j < howmany; j += 1) { + float32x2_t v407 = v5[istride]; + float v1434 = 7.0710678118654757e-01F; + float v1445 = -7.0710678118654746e-01F; + float v1495 = 5.5557023301960229e-01F; + float v1509 = -1.9509032201612861e-01F; + float v1560 = 9.2387953251128674e-01F; + float v1567 = -9.2387953251128685e-01F; + float v1570 = 3.8268343236508967e-01F; + float v1571 = -3.8268343236508967e-01F; + float v1617 = 1.9509032201612833e-01F; + float v1620 = -9.8078528040323043e-01F; + float v1621 = 9.8078528040323043e-01F; + float v1628 = -5.5557023301960218e-01F; + float v1631 = 8.3146961230254524e-01F; + float v1632 = -8.3146961230254524e-01F; + float v1642 = -1.0000000000000000e+00F; + float v1643 = 1.0000000000000000e+00F; + float32x2_t v1645 = (float32x2_t){v4, v4}; + float32x2_t v444 = vtrn1_f32(v407, v407); + float32x2_t v445 = vtrn2_f32(v407, v407); + float32x2_t v851 = v5[0]; + float32x2_t v1252 = (float32x2_t){v1621, v1621}; + float32x2_t v1313 = (float32x2_t){v1560, v1560}; + float32x2_t v1317 = (float32x2_t){v1571, v1570}; + float32x2_t v1374 = (float32x2_t){v1631, v1631}; + float32x2_t 
v1378 = (float32x2_t){v1628, v1495}; + float32x2_t v1385 = (float32x2_t){v1509, v1509}; + float32x2_t v1435 = (float32x2_t){v1434, v1434}; + float32x2_t v1446 = (float32x2_t){v1445, v1445}; + float32x2_t v1450 = (float32x2_t){v1643, v1642}; + float32x2_t v1496 = (float32x2_t){v1495, v1495}; + float32x2_t v1500 = (float32x2_t){v1632, v1631}; + float32x2_t v1507 = (float32x2_t){v1620, v1620}; + float32x2_t v1511 = (float32x2_t){v1509, v1617}; + float32x2_t v1557 = (float32x2_t){v1570, v1570}; + float32x2_t v1561 = (float32x2_t){v1567, v1560}; + float32x2_t v1568 = (float32x2_t){v1567, v1567}; + float32x2_t v1572 = (float32x2_t){v1570, v1571}; + float32x2_t v1618 = (float32x2_t){v1617, v1617}; + float32x2_t v1622 = (float32x2_t){v1620, v1621}; + float32x2_t v1629 = (float32x2_t){v1628, v1628}; + float32x2_t v1633 = (float32x2_t){v1631, v1632}; + float32x2_t v1644 = (float32x2_t){v1642, v1643}; + float32x2_t v20 = v5[istride * 16]; + int64_t v37 = 30 + j * 62; + float32x2_t v51 = v5[istride * 8]; + int64_t v55 = 14 + j * 62; + float32x2_t v69 = v5[istride * 24]; + int64_t v73 = 46 + j * 62; + float32x2_t v87 = v5[istride * 4]; + float32x2_t v105 = v5[istride * 20]; + int64_t v122 = 6 + j * 62; + int64_t v135 = 38 + j * 62; + float32x2_t v149 = v5[istride * 12]; + float32x2_t v167 = v5[istride * 28]; + int64_t v184 = 22 + j * 62; + int64_t v197 = 54 + j * 62; + float32x2_t v211 = v5[istride * 2]; + float32x2_t v229 = v5[istride * 18]; + int64_t v246 = 2 + j * 62; + int64_t v259 = 34 + j * 62; + float32x2_t v273 = v5[istride * 10]; + int64_t v277 = 18 + j * 62; + float32x2_t v291 = v5[istride * 26]; + int64_t v295 = 50 + j * 62; + float32x2_t v309 = v5[istride * 6]; + float32x2_t v327 = v5[istride * 22]; + int64_t v344 = 10 + j * 62; + int64_t v357 = 42 + j * 62; + float32x2_t v371 = v5[istride * 14]; + int64_t v375 = 26 + j * 62; + float32x2_t v389 = v5[istride * 30]; + int64_t v393 = 58 + j * 62; + float32x2_t v425 = v5[istride * 17]; + float32x2_t v443 = v7[j * 62]; + int64_t v447 = j * 62 + 1; + int64_t v455 = 32 + j * 62; + float32x2_t v469 = v5[istride * 9]; + int64_t v473 = 16 + j * 62; + float32x2_t v487 = v5[istride * 25]; + int64_t v491 = 48 + j * 62; + float32x2_t v505 = v5[istride * 5]; + float32x2_t v523 = v5[istride * 21]; + int64_t v540 = 8 + j * 62; + int64_t v553 = 40 + j * 62; + float32x2_t v567 = v5[istride * 13]; + float32x2_t v585 = v5[istride * 29]; + int64_t v602 = 24 + j * 62; + int64_t v615 = 56 + j * 62; + float32x2_t v629 = v5[istride * 3]; + float32x2_t v647 = v5[istride * 19]; + int64_t v664 = 4 + j * 62; + int64_t v677 = 36 + j * 62; + float32x2_t v691 = v5[istride * 11]; + int64_t v695 = 20 + j * 62; + float32x2_t v709 = v5[istride * 27]; + int64_t v713 = 52 + j * 62; + float32x2_t v727 = v5[istride * 7]; + float32x2_t v745 = v5[istride * 23]; + int64_t v762 = 12 + j * 62; + int64_t v775 = 44 + j * 62; + float32x2_t v789 = v5[istride * 15]; + float32x2_t v807 = v5[istride * 31]; + int64_t v824 = 28 + j * 62; + int64_t v837 = 60 + j * 62; + float32x2_t v1319 = vmul_f32(v1645, v1317); + float32x2_t v1380 = vmul_f32(v1645, v1378); + float32x2_t v1452 = vmul_f32(v1645, v1450); + float32x2_t v1502 = vmul_f32(v1645, v1500); + float32x2_t v1513 = vmul_f32(v1645, v1511); + float32x2_t v1563 = vmul_f32(v1645, v1561); + float32x2_t v1574 = vmul_f32(v1645, v1572); + float32x2_t v1624 = vmul_f32(v1645, v1622); + float32x2_t v1635 = vmul_f32(v1645, v1633); + float32x2_t v1646 = vmul_f32(v1645, v1644); + float32x2_t v38 = v7[v37]; + float32x2_t v39 = vtrn1_f32(v20, v20); + 
float32x2_t v40 = vtrn2_f32(v20, v20); + int64_t v42 = v37 + 1; + float32x2_t v56 = v7[v55]; + float32x2_t v57 = vtrn1_f32(v51, v51); + float32x2_t v58 = vtrn2_f32(v51, v51); + int64_t v60 = v55 + 1; + float32x2_t v74 = v7[v73]; + float32x2_t v75 = vtrn1_f32(v69, v69); + float32x2_t v76 = vtrn2_f32(v69, v69); + int64_t v78 = v73 + 1; + float32x2_t v123 = v7[v122]; + float32x2_t v124 = vtrn1_f32(v87, v87); + float32x2_t v125 = vtrn2_f32(v87, v87); + int64_t v127 = v122 + 1; + float32x2_t v136 = v7[v135]; + float32x2_t v137 = vtrn1_f32(v105, v105); + float32x2_t v138 = vtrn2_f32(v105, v105); + int64_t v140 = v135 + 1; + float32x2_t v185 = v7[v184]; + float32x2_t v186 = vtrn1_f32(v149, v149); + float32x2_t v187 = vtrn2_f32(v149, v149); + int64_t v189 = v184 + 1; + float32x2_t v198 = v7[v197]; + float32x2_t v199 = vtrn1_f32(v167, v167); + float32x2_t v200 = vtrn2_f32(v167, v167); + int64_t v202 = v197 + 1; + float32x2_t v247 = v7[v246]; + float32x2_t v248 = vtrn1_f32(v211, v211); + float32x2_t v249 = vtrn2_f32(v211, v211); + int64_t v251 = v246 + 1; + float32x2_t v260 = v7[v259]; + float32x2_t v261 = vtrn1_f32(v229, v229); + float32x2_t v262 = vtrn2_f32(v229, v229); + int64_t v264 = v259 + 1; + float32x2_t v278 = v7[v277]; + float32x2_t v279 = vtrn1_f32(v273, v273); + float32x2_t v280 = vtrn2_f32(v273, v273); + int64_t v282 = v277 + 1; + float32x2_t v296 = v7[v295]; + float32x2_t v297 = vtrn1_f32(v291, v291); + float32x2_t v298 = vtrn2_f32(v291, v291); + int64_t v300 = v295 + 1; + float32x2_t v345 = v7[v344]; + float32x2_t v346 = vtrn1_f32(v309, v309); + float32x2_t v347 = vtrn2_f32(v309, v309); + int64_t v349 = v344 + 1; + float32x2_t v358 = v7[v357]; + float32x2_t v359 = vtrn1_f32(v327, v327); + float32x2_t v360 = vtrn2_f32(v327, v327); + int64_t v362 = v357 + 1; + float32x2_t v376 = v7[v375]; + float32x2_t v377 = vtrn1_f32(v371, v371); + float32x2_t v378 = vtrn2_f32(v371, v371); + int64_t v380 = v375 + 1; + float32x2_t v394 = v7[v393]; + float32x2_t v395 = vtrn1_f32(v389, v389); + float32x2_t v396 = vtrn2_f32(v389, v389); + int64_t v398 = v393 + 1; + float32x2_t v448 = v7[v447]; + float32x2_t v449 = vmul_f32(v444, v443); + float32x2_t v456 = v7[v455]; + float32x2_t v457 = vtrn1_f32(v425, v425); + float32x2_t v458 = vtrn2_f32(v425, v425); + int64_t v460 = v455 + 1; + float32x2_t v474 = v7[v473]; + float32x2_t v475 = vtrn1_f32(v469, v469); + float32x2_t v476 = vtrn2_f32(v469, v469); + int64_t v478 = v473 + 1; + float32x2_t v492 = v7[v491]; + float32x2_t v493 = vtrn1_f32(v487, v487); + float32x2_t v494 = vtrn2_f32(v487, v487); + int64_t v496 = v491 + 1; + float32x2_t v541 = v7[v540]; + float32x2_t v542 = vtrn1_f32(v505, v505); + float32x2_t v543 = vtrn2_f32(v505, v505); + int64_t v545 = v540 + 1; + float32x2_t v554 = v7[v553]; + float32x2_t v555 = vtrn1_f32(v523, v523); + float32x2_t v556 = vtrn2_f32(v523, v523); + int64_t v558 = v553 + 1; + float32x2_t v603 = v7[v602]; + float32x2_t v604 = vtrn1_f32(v567, v567); + float32x2_t v605 = vtrn2_f32(v567, v567); + int64_t v607 = v602 + 1; + float32x2_t v616 = v7[v615]; + float32x2_t v617 = vtrn1_f32(v585, v585); + float32x2_t v618 = vtrn2_f32(v585, v585); + int64_t v620 = v615 + 1; + float32x2_t v665 = v7[v664]; + float32x2_t v666 = vtrn1_f32(v629, v629); + float32x2_t v667 = vtrn2_f32(v629, v629); + int64_t v669 = v664 + 1; + float32x2_t v678 = v7[v677]; + float32x2_t v679 = vtrn1_f32(v647, v647); + float32x2_t v680 = vtrn2_f32(v647, v647); + int64_t v682 = v677 + 1; + float32x2_t v696 = v7[v695]; + float32x2_t v697 = vtrn1_f32(v691, v691); + 
float32x2_t v698 = vtrn2_f32(v691, v691); + int64_t v700 = v695 + 1; + float32x2_t v714 = v7[v713]; + float32x2_t v715 = vtrn1_f32(v709, v709); + float32x2_t v716 = vtrn2_f32(v709, v709); + int64_t v718 = v713 + 1; + float32x2_t v763 = v7[v762]; + float32x2_t v764 = vtrn1_f32(v727, v727); + float32x2_t v765 = vtrn2_f32(v727, v727); + int64_t v767 = v762 + 1; + float32x2_t v776 = v7[v775]; + float32x2_t v777 = vtrn1_f32(v745, v745); + float32x2_t v778 = vtrn2_f32(v745, v745); + int64_t v780 = v775 + 1; + float32x2_t v825 = v7[v824]; + float32x2_t v826 = vtrn1_f32(v789, v789); + float32x2_t v827 = vtrn2_f32(v789, v789); + int64_t v829 = v824 + 1; + float32x2_t v838 = v7[v837]; + float32x2_t v839 = vtrn1_f32(v807, v807); + float32x2_t v840 = vtrn2_f32(v807, v807); + int64_t v842 = v837 + 1; + float32x2_t v43 = v7[v42]; + float32x2_t v44 = vmul_f32(v39, v38); + float32x2_t v61 = v7[v60]; + float32x2_t v62 = vmul_f32(v57, v56); + float32x2_t v79 = v7[v78]; + float32x2_t v80 = vmul_f32(v75, v74); + float32x2_t v128 = v7[v127]; + float32x2_t v129 = vmul_f32(v124, v123); + float32x2_t v141 = v7[v140]; + float32x2_t v142 = vmul_f32(v137, v136); + float32x2_t v190 = v7[v189]; + float32x2_t v191 = vmul_f32(v186, v185); + float32x2_t v203 = v7[v202]; + float32x2_t v204 = vmul_f32(v199, v198); + float32x2_t v252 = v7[v251]; + float32x2_t v253 = vmul_f32(v248, v247); + float32x2_t v265 = v7[v264]; + float32x2_t v266 = vmul_f32(v261, v260); + float32x2_t v283 = v7[v282]; + float32x2_t v284 = vmul_f32(v279, v278); + float32x2_t v301 = v7[v300]; + float32x2_t v302 = vmul_f32(v297, v296); + float32x2_t v350 = v7[v349]; + float32x2_t v351 = vmul_f32(v346, v345); + float32x2_t v363 = v7[v362]; + float32x2_t v364 = vmul_f32(v359, v358); + float32x2_t v381 = v7[v380]; + float32x2_t v382 = vmul_f32(v377, v376); + float32x2_t v399 = v7[v398]; + float32x2_t v400 = vmul_f32(v395, v394); + float32x2_t v461 = v7[v460]; + float32x2_t v462 = vmul_f32(v457, v456); + float32x2_t v479 = v7[v478]; + float32x2_t v480 = vmul_f32(v475, v474); + float32x2_t v497 = v7[v496]; + float32x2_t v498 = vmul_f32(v493, v492); + float32x2_t v546 = v7[v545]; + float32x2_t v547 = vmul_f32(v542, v541); + float32x2_t v559 = v7[v558]; + float32x2_t v560 = vmul_f32(v555, v554); + float32x2_t v608 = v7[v607]; + float32x2_t v609 = vmul_f32(v604, v603); + float32x2_t v621 = v7[v620]; + float32x2_t v622 = vmul_f32(v617, v616); + float32x2_t v670 = v7[v669]; + float32x2_t v671 = vmul_f32(v666, v665); + float32x2_t v683 = v7[v682]; + float32x2_t v684 = vmul_f32(v679, v678); + float32x2_t v701 = v7[v700]; + float32x2_t v702 = vmul_f32(v697, v696); + float32x2_t v719 = v7[v718]; + float32x2_t v720 = vmul_f32(v715, v714); + float32x2_t v768 = v7[v767]; + float32x2_t v769 = vmul_f32(v764, v763); + float32x2_t v781 = v7[v780]; + float32x2_t v782 = vmul_f32(v777, v776); + float32x2_t v830 = v7[v829]; + float32x2_t v831 = vmul_f32(v826, v825); + float32x2_t v843 = v7[v842]; + float32x2_t v844 = vmul_f32(v839, v838); + float32x2_t v451 = vfma_f32(v449, v445, v448); + float32x2_t v46 = vfma_f32(v44, v40, v43); + float32x2_t v64 = vfma_f32(v62, v58, v61); + float32x2_t v82 = vfma_f32(v80, v76, v79); + float32x2_t v131 = vfma_f32(v129, v125, v128); + float32x2_t v144 = vfma_f32(v142, v138, v141); + float32x2_t v193 = vfma_f32(v191, v187, v190); + float32x2_t v206 = vfma_f32(v204, v200, v203); + float32x2_t v255 = vfma_f32(v253, v249, v252); + float32x2_t v268 = vfma_f32(v266, v262, v265); + float32x2_t v286 = vfma_f32(v284, v280, v283); + float32x2_t v304 = 
vfma_f32(v302, v298, v301); + float32x2_t v353 = vfma_f32(v351, v347, v350); + float32x2_t v366 = vfma_f32(v364, v360, v363); + float32x2_t v384 = vfma_f32(v382, v378, v381); + float32x2_t v402 = vfma_f32(v400, v396, v399); + float32x2_t v464 = vfma_f32(v462, v458, v461); + float32x2_t v482 = vfma_f32(v480, v476, v479); + float32x2_t v500 = vfma_f32(v498, v494, v497); + float32x2_t v549 = vfma_f32(v547, v543, v546); + float32x2_t v562 = vfma_f32(v560, v556, v559); + float32x2_t v611 = vfma_f32(v609, v605, v608); + float32x2_t v624 = vfma_f32(v622, v618, v621); + float32x2_t v673 = vfma_f32(v671, v667, v670); + float32x2_t v686 = vfma_f32(v684, v680, v683); + float32x2_t v704 = vfma_f32(v702, v698, v701); + float32x2_t v722 = vfma_f32(v720, v716, v719); + float32x2_t v771 = vfma_f32(v769, v765, v768); + float32x2_t v784 = vfma_f32(v782, v778, v781); + float32x2_t v833 = vfma_f32(v831, v827, v830); + float32x2_t v846 = vfma_f32(v844, v840, v843); + float32x2_t v852 = vadd_f32(v851, v46); + float32x2_t v853 = vsub_f32(v851, v46); + float32x2_t v854 = vadd_f32(v64, v82); + float32x2_t v855 = vsub_f32(v64, v82); + float32x2_t v867 = vadd_f32(v131, v144); + float32x2_t v868 = vsub_f32(v131, v144); + float32x2_t v869 = vadd_f32(v193, v206); + float32x2_t v870 = vsub_f32(v193, v206); + float32x2_t v921 = vadd_f32(v255, v268); + float32x2_t v922 = vsub_f32(v255, v268); + float32x2_t v923 = vadd_f32(v286, v304); + float32x2_t v924 = vsub_f32(v286, v304); + float32x2_t v936 = vadd_f32(v353, v366); + float32x2_t v937 = vsub_f32(v353, v366); + float32x2_t v938 = vadd_f32(v384, v402); + float32x2_t v939 = vsub_f32(v384, v402); + float32x2_t v1075 = vadd_f32(v451, v464); + float32x2_t v1076 = vsub_f32(v451, v464); + float32x2_t v1077 = vadd_f32(v482, v500); + float32x2_t v1078 = vsub_f32(v482, v500); + float32x2_t v1090 = vadd_f32(v549, v562); + float32x2_t v1091 = vsub_f32(v549, v562); + float32x2_t v1092 = vadd_f32(v611, v624); + float32x2_t v1093 = vsub_f32(v611, v624); + float32x2_t v1144 = vadd_f32(v673, v686); + float32x2_t v1145 = vsub_f32(v673, v686); + float32x2_t v1146 = vadd_f32(v704, v722); + float32x2_t v1147 = vsub_f32(v704, v722); + float32x2_t v1159 = vadd_f32(v771, v784); + float32x2_t v1160 = vsub_f32(v771, v784); + float32x2_t v1161 = vadd_f32(v833, v846); + float32x2_t v1162 = vsub_f32(v833, v846); + float32x2_t v861 = vrev64_f32(v855); + float32x2_t v863 = vadd_f32(v852, v854); + float32x2_t v864 = vsub_f32(v852, v854); + float32x2_t v871 = vadd_f32(v867, v869); + float32x2_t v872 = vsub_f32(v867, v869); + float32x2_t v887 = vmul_f32(v868, v1435); + float32x2_t v898 = vmul_f32(v870, v1446); + float32x2_t v930 = vrev64_f32(v924); + float32x2_t v932 = vadd_f32(v921, v923); + float32x2_t v933 = vsub_f32(v921, v923); + float32x2_t v945 = vrev64_f32(v939); + float32x2_t v947 = vadd_f32(v936, v938); + float32x2_t v948 = vsub_f32(v936, v938); + float32x2_t v1084 = vrev64_f32(v1078); + float32x2_t v1086 = vadd_f32(v1075, v1077); + float32x2_t v1087 = vsub_f32(v1075, v1077); + float32x2_t v1094 = vadd_f32(v1090, v1092); + float32x2_t v1095 = vsub_f32(v1090, v1092); + float32x2_t v1110 = vmul_f32(v1091, v1435); + float32x2_t v1121 = vmul_f32(v1093, v1446); + float32x2_t v1153 = vrev64_f32(v1147); + float32x2_t v1155 = vadd_f32(v1144, v1146); + float32x2_t v1156 = vsub_f32(v1144, v1146); + float32x2_t v1163 = vadd_f32(v1159, v1161); + float32x2_t v1164 = vsub_f32(v1159, v1161); + float32x2_t v1179 = vmul_f32(v1160, v1435); + float32x2_t v1190 = vmul_f32(v1162, v1446); + float32x2_t v862 = 
vmul_f32(v861, v1452); + float32x2_t v878 = vrev64_f32(v872); + float32x2_t v880 = vadd_f32(v863, v871); + float32x2_t v881 = vsub_f32(v863, v871); + float32x2_t v893 = vrev64_f32(v887); + float32x2_t v904 = vrev64_f32(v898); + float32x2_t v931 = vmul_f32(v930, v1452); + float32x2_t v946 = vmul_f32(v945, v1452); + float32x2_t v951 = vadd_f32(v932, v947); + float32x2_t v952 = vsub_f32(v932, v947); + float32x2_t v1004 = vmul_f32(v933, v1435); + float32x2_t v1015 = vmul_f32(v948, v1446); + float32x2_t v1085 = vmul_f32(v1084, v1452); + float32x2_t v1101 = vrev64_f32(v1095); + float32x2_t v1103 = vadd_f32(v1086, v1094); + float32x2_t v1104 = vsub_f32(v1086, v1094); + float32x2_t v1116 = vrev64_f32(v1110); + float32x2_t v1127 = vrev64_f32(v1121); + float32x2_t v1154 = vmul_f32(v1153, v1452); + float32x2_t v1170 = vrev64_f32(v1164); + float32x2_t v1172 = vadd_f32(v1155, v1163); + float32x2_t v1173 = vsub_f32(v1155, v1163); + float32x2_t v1185 = vrev64_f32(v1179); + float32x2_t v1196 = vrev64_f32(v1190); + float32x2_t v865 = vsub_f32(v853, v862); + float32x2_t v866 = vadd_f32(v853, v862); + float32x2_t v879 = vmul_f32(v878, v1452); + float32x2_t v894 = vmul_f32(v893, v1646); + float32x2_t v905 = vmul_f32(v904, v1452); + float32x2_t v934 = vsub_f32(v922, v931); + float32x2_t v935 = vadd_f32(v922, v931); + float32x2_t v949 = vsub_f32(v937, v946); + float32x2_t v950 = vadd_f32(v937, v946); + float32x2_t v958 = vrev64_f32(v952); + float32x2_t v960 = vadd_f32(v880, v951); + float32x2_t v961 = vsub_f32(v880, v951); + float32x2_t v1010 = vrev64_f32(v1004); + float32x2_t v1021 = vrev64_f32(v1015); + float32x2_t v1088 = vsub_f32(v1076, v1085); + float32x2_t v1089 = vadd_f32(v1076, v1085); + float32x2_t v1102 = vmul_f32(v1101, v1452); + float32x2_t v1117 = vmul_f32(v1116, v1646); + float32x2_t v1128 = vmul_f32(v1127, v1452); + float32x2_t v1157 = vsub_f32(v1145, v1154); + float32x2_t v1158 = vadd_f32(v1145, v1154); + float32x2_t v1171 = vmul_f32(v1170, v1452); + float32x2_t v1186 = vmul_f32(v1185, v1646); + float32x2_t v1197 = vmul_f32(v1196, v1452); + float32x2_t v1213 = vadd_f32(v1103, v1172); + float32x2_t v1214 = vsub_f32(v1103, v1172); + float32x2_t v1436 = vmul_f32(v1104, v1435); + float32x2_t v1447 = vmul_f32(v1173, v1446); + float32x2_t v882 = vsub_f32(v864, v879); + float32x2_t v883 = vadd_f32(v864, v879); + float32x2_t v906 = vadd_f32(v887, v894); + float32x2_t v907 = vadd_f32(v898, v905); + float32x2_t v959 = vmul_f32(v958, v1452); + float32x2_t v967 = vmul_f32(v934, v1313); + float32x2_t v973 = vrev64_f32(v934); + float32x2_t v978 = vmul_f32(v949, v1557); + float32x2_t v984 = vrev64_f32(v949); + float32x2_t v1011 = vmul_f32(v1010, v1646); + float32x2_t v1022 = vmul_f32(v1021, v1452); + float32x2_t v1041 = vmul_f32(v935, v1557); + float32x2_t v1047 = vrev64_f32(v935); + float32x2_t v1052 = vmul_f32(v950, v1568); + float32x2_t v1058 = vrev64_f32(v950); + float32x2_t v1105 = vsub_f32(v1087, v1102); + float32x2_t v1106 = vadd_f32(v1087, v1102); + float32x2_t v1129 = vadd_f32(v1110, v1117); + float32x2_t v1130 = vadd_f32(v1121, v1128); + float32x2_t v1174 = vsub_f32(v1156, v1171); + float32x2_t v1175 = vadd_f32(v1156, v1171); + float32x2_t v1198 = vadd_f32(v1179, v1186); + float32x2_t v1199 = vadd_f32(v1190, v1197); + float32x2_t v1220 = vrev64_f32(v1214); + float32x2_t v1222 = vadd_f32(v960, v1213); + float32x2_t v1223 = vsub_f32(v960, v1213); + float32x2_t v1442 = vrev64_f32(v1436); + float32x2_t v1453 = vrev64_f32(v1447); + float32x2_t v908 = vadd_f32(v906, v907); + float32x2_t v909 = 
vsub_f32(v907, v906); + float32x2_t v962 = vsub_f32(v881, v959); + float32x2_t v963 = vadd_f32(v881, v959); + float32x2_t v1023 = vadd_f32(v1004, v1011); + float32x2_t v1024 = vadd_f32(v1015, v1022); + float32x2_t v1131 = vadd_f32(v1129, v1130); + float32x2_t v1132 = vsub_f32(v1130, v1129); + float32x2_t v1200 = vadd_f32(v1198, v1199); + float32x2_t v1201 = vsub_f32(v1199, v1198); + float32x2_t v1221 = vmul_f32(v1220, v1452); + int16x4_t v1228 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1222, 15), (int32x2_t){0, 0})); + int16x4_t v1240 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1223, 15), (int32x2_t){0, 0})); + float32x2_t v1314 = vmul_f32(v1105, v1313); + float32x2_t v1320 = vrev64_f32(v1105); + float32x2_t v1325 = vmul_f32(v1174, v1557); + float32x2_t v1331 = vrev64_f32(v1174); + float32x2_t v1443 = vmul_f32(v1442, v1646); + float32x2_t v1454 = vmul_f32(v1453, v1452); + float32x2_t v1558 = vmul_f32(v1106, v1557); + float32x2_t v1564 = vrev64_f32(v1106); + float32x2_t v1569 = vmul_f32(v1175, v1568); + float32x2_t v1575 = vrev64_f32(v1175); + float32x2_t v915 = vrev64_f32(v909); + float32x2_t v917 = vadd_f32(v865, v908); + float32x2_t v918 = vsub_f32(v865, v908); + float32x2_t v986 = vfma_f32(v967, v973, v1319); + float32x2_t v987 = vfma_f32(v978, v984, v1563); + float32x2_t v1025 = vadd_f32(v1023, v1024); + float32x2_t v1026 = vsub_f32(v1024, v1023); + float32x2_t v1060 = vfma_f32(v1041, v1047, v1563); + float32x2_t v1061 = vfma_f32(v1052, v1058, v1574); + float32x2_t v1138 = vrev64_f32(v1132); + float32x2_t v1140 = vadd_f32(v1088, v1131); + float32x2_t v1141 = vsub_f32(v1088, v1131); + float32x2_t v1207 = vrev64_f32(v1201); + float32x2_t v1209 = vadd_f32(v1157, v1200); + float32x2_t v1210 = vsub_f32(v1157, v1200); + float32x2_t v1224 = vsub_f32(v961, v1221); + float32x2_t v1225 = vadd_f32(v961, v1221); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v1228), 0); + v6[ostride * 16] = vget_lane_s32(vreinterpret_s32_s16(v1240), 0); + float32x2_t v1455 = vadd_f32(v1436, v1443); + float32x2_t v1456 = vadd_f32(v1447, v1454); + float32x2_t v916 = vmul_f32(v915, v1646); + float32x2_t v988 = vadd_f32(v986, v987); + float32x2_t v989 = vsub_f32(v987, v986); + float32x2_t v1032 = vrev64_f32(v1026); + float32x2_t v1034 = vadd_f32(v882, v1025); + float32x2_t v1035 = vsub_f32(v882, v1025); + float32x2_t v1062 = vadd_f32(v1060, v1061); + float32x2_t v1063 = vsub_f32(v1061, v1060); + float32x2_t v1139 = vmul_f32(v1138, v1646); + float32x2_t v1208 = vmul_f32(v1207, v1646); + int16x4_t v1234 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1224, 15), (int32x2_t){0, 0})); + int16x4_t v1246 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1225, 15), (int32x2_t){0, 0})); + float32x2_t v1253 = vmul_f32(v1140, v1252); + float32x2_t v1259 = vrev64_f32(v1140); + float32x2_t v1264 = vmul_f32(v1209, v1374); + float32x2_t v1270 = vrev64_f32(v1209); + float32x2_t v1333 = vfma_f32(v1314, v1320, v1319); + float32x2_t v1334 = vfma_f32(v1325, v1331, v1563); + float32x2_t v1457 = vadd_f32(v1455, v1456); + float32x2_t v1458 = vsub_f32(v1456, v1455); + float32x2_t v1497 = vmul_f32(v1141, v1496); + float32x2_t v1503 = vrev64_f32(v1141); + float32x2_t v1508 = vmul_f32(v1210, v1507); + float32x2_t v1514 = vrev64_f32(v1210); + float32x2_t v1577 = vfma_f32(v1558, v1564, v1563); + float32x2_t v1578 = vfma_f32(v1569, v1575, v1574); + float32x2_t v919 = vsub_f32(v866, v916); + float32x2_t v920 = vadd_f32(v866, v916); + float32x2_t v995 = vrev64_f32(v989); + float32x2_t v997 = vadd_f32(v917, v988); + float32x2_t v998 = vsub_f32(v917, v988); + 
float32x2_t v1033 = vmul_f32(v1032, v1646); + float32x2_t v1069 = vrev64_f32(v1063); + float32x2_t v1142 = vsub_f32(v1089, v1139); + float32x2_t v1143 = vadd_f32(v1089, v1139); + float32x2_t v1211 = vsub_f32(v1158, v1208); + float32x2_t v1212 = vadd_f32(v1158, v1208); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v1234), 0); + v6[ostride * 24] = vget_lane_s32(vreinterpret_s32_s16(v1246), 0); + float32x2_t v1335 = vadd_f32(v1333, v1334); + float32x2_t v1336 = vsub_f32(v1334, v1333); + float32x2_t v1464 = vrev64_f32(v1458); + float32x2_t v1466 = vadd_f32(v962, v1457); + float32x2_t v1467 = vsub_f32(v962, v1457); + float32x2_t v1579 = vadd_f32(v1577, v1578); + float32x2_t v1580 = vsub_f32(v1578, v1577); + float32x2_t v996 = vmul_f32(v995, v1646); + float32x2_t v1036 = vsub_f32(v883, v1033); + float32x2_t v1037 = vadd_f32(v883, v1033); + float32x2_t v1070 = vmul_f32(v1069, v1646); + float32x2_t v1071 = vadd_f32(v919, v1062); + float32x2_t v1072 = vsub_f32(v919, v1062); + float32x2_t v1272 = vfma_f32(v1253, v1259, v1513); + float32x2_t v1273 = vfma_f32(v1264, v1270, v1380); + float32x2_t v1342 = vrev64_f32(v1336); + float32x2_t v1344 = vadd_f32(v1034, v1335); + float32x2_t v1345 = vsub_f32(v1034, v1335); + float32x2_t v1375 = vmul_f32(v1142, v1374); + float32x2_t v1381 = vrev64_f32(v1142); + float32x2_t v1386 = vmul_f32(v1211, v1385); + float32x2_t v1392 = vrev64_f32(v1211); + float32x2_t v1465 = vmul_f32(v1464, v1646); + int16x4_t v1472 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1466, 15), (int32x2_t){0, 0})); + int16x4_t v1484 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1467, 15), (int32x2_t){0, 0})); + float32x2_t v1516 = vfma_f32(v1497, v1503, v1502); + float32x2_t v1517 = vfma_f32(v1508, v1514, v1513); + float32x2_t v1586 = vrev64_f32(v1580); + float32x2_t v1619 = vmul_f32(v1143, v1618); + float32x2_t v1625 = vrev64_f32(v1143); + float32x2_t v1630 = vmul_f32(v1212, v1629); + float32x2_t v1636 = vrev64_f32(v1212); + float32x2_t v999 = vsub_f32(v918, v996); + float32x2_t v1000 = vadd_f32(v918, v996); + float32x2_t v1073 = vsub_f32(v920, v1070); + float32x2_t v1074 = vadd_f32(v920, v1070); + float32x2_t v1274 = vadd_f32(v1272, v1273); + float32x2_t v1275 = vsub_f32(v1273, v1272); + float32x2_t v1343 = vmul_f32(v1342, v1646); + int16x4_t v1350 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1344, 15), (int32x2_t){0, 0})); + int16x4_t v1362 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1345, 15), (int32x2_t){0, 0})); + float32x2_t v1468 = vsub_f32(v963, v1465); + float32x2_t v1469 = vadd_f32(v963, v1465); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v1472), 0); + v6[ostride * 20] = vget_lane_s32(vreinterpret_s32_s16(v1484), 0); + float32x2_t v1518 = vadd_f32(v1516, v1517); + float32x2_t v1519 = vsub_f32(v1517, v1516); + float32x2_t v1587 = vmul_f32(v1586, v1646); + float32x2_t v1588 = vadd_f32(v1036, v1579); + float32x2_t v1589 = vsub_f32(v1036, v1579); + float32x2_t v1281 = vrev64_f32(v1275); + float32x2_t v1283 = vadd_f32(v997, v1274); + float32x2_t v1284 = vsub_f32(v997, v1274); + float32x2_t v1346 = vsub_f32(v1035, v1343); + float32x2_t v1347 = vadd_f32(v1035, v1343); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v1350), 0); + v6[ostride * 18] = vget_lane_s32(vreinterpret_s32_s16(v1362), 0); + float32x2_t v1394 = vfma_f32(v1375, v1381, v1380); + float32x2_t v1395 = vfma_f32(v1386, v1392, v1624); + int16x4_t v1478 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1468, 15), (int32x2_t){0, 0})); + int16x4_t v1490 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1469, 15), 
(int32x2_t){0, 0})); + float32x2_t v1525 = vrev64_f32(v1519); + float32x2_t v1527 = vadd_f32(v999, v1518); + float32x2_t v1528 = vsub_f32(v999, v1518); + float32x2_t v1590 = vsub_f32(v1037, v1587); + float32x2_t v1591 = vadd_f32(v1037, v1587); + int16x4_t v1594 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1588, 15), (int32x2_t){0, 0})); + int16x4_t v1606 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1589, 15), (int32x2_t){0, 0})); + float32x2_t v1638 = vfma_f32(v1619, v1625, v1624); + float32x2_t v1639 = vfma_f32(v1630, v1636, v1635); + float32x2_t v1282 = vmul_f32(v1281, v1646); + int16x4_t v1289 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1283, 15), (int32x2_t){0, 0})); + int16x4_t v1301 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1284, 15), (int32x2_t){0, 0})); + int16x4_t v1356 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1346, 15), (int32x2_t){0, 0})); + int16x4_t v1368 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1347, 15), (int32x2_t){0, 0})); + float32x2_t v1396 = vadd_f32(v1394, v1395); + float32x2_t v1397 = vsub_f32(v1395, v1394); + v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v1478), 0); + v6[ostride * 28] = vget_lane_s32(vreinterpret_s32_s16(v1490), 0); + float32x2_t v1526 = vmul_f32(v1525, v1646); + int16x4_t v1533 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1527, 15), (int32x2_t){0, 0})); + int16x4_t v1545 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1528, 15), (int32x2_t){0, 0})); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v1594), 0); + int16x4_t v1600 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1590, 15), (int32x2_t){0, 0})); + v6[ostride * 22] = vget_lane_s32(vreinterpret_s32_s16(v1606), 0); + int16x4_t v1612 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1591, 15), (int32x2_t){0, 0})); + float32x2_t v1640 = vadd_f32(v1638, v1639); + float32x2_t v1641 = vsub_f32(v1639, v1638); + float32x2_t v1285 = vsub_f32(v998, v1282); + float32x2_t v1286 = vadd_f32(v998, v1282); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v1289), 0); + v6[ostride * 17] = vget_lane_s32(vreinterpret_s32_s16(v1301), 0); + v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v1356), 0); + v6[ostride * 26] = vget_lane_s32(vreinterpret_s32_s16(v1368), 0); + float32x2_t v1403 = vrev64_f32(v1397); + float32x2_t v1405 = vadd_f32(v1071, v1396); + float32x2_t v1406 = vsub_f32(v1071, v1396); + float32x2_t v1529 = vsub_f32(v1000, v1526); + float32x2_t v1530 = vadd_f32(v1000, v1526); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v1533), 0); + v6[ostride * 21] = vget_lane_s32(vreinterpret_s32_s16(v1545), 0); + v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v1600), 0); + v6[ostride * 30] = vget_lane_s32(vreinterpret_s32_s16(v1612), 0); + float32x2_t v1647 = vrev64_f32(v1641); + float32x2_t v1649 = vadd_f32(v1073, v1640); + float32x2_t v1650 = vsub_f32(v1073, v1640); + int16x4_t v1295 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1285, 15), (int32x2_t){0, 0})); + int16x4_t v1307 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1286, 15), (int32x2_t){0, 0})); + float32x2_t v1404 = vmul_f32(v1403, v1646); + int16x4_t v1411 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1405, 15), (int32x2_t){0, 0})); + int16x4_t v1423 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1406, 15), (int32x2_t){0, 0})); + int16x4_t v1539 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1529, 15), (int32x2_t){0, 0})); + int16x4_t v1551 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1530, 15), (int32x2_t){0, 0})); + float32x2_t v1648 = vmul_f32(v1647, v1646); + int16x4_t v1655 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1649, 15), 
(int32x2_t){0, 0})); + int16x4_t v1667 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1650, 15), (int32x2_t){0, 0})); + v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v1295), 0); + v6[ostride * 25] = vget_lane_s32(vreinterpret_s32_s16(v1307), 0); + float32x2_t v1407 = vsub_f32(v1072, v1404); + float32x2_t v1408 = vadd_f32(v1072, v1404); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v1411), 0); + v6[ostride * 19] = vget_lane_s32(vreinterpret_s32_s16(v1423), 0); + v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v1539), 0); + v6[ostride * 29] = vget_lane_s32(vreinterpret_s32_s16(v1551), 0); + float32x2_t v1651 = vsub_f32(v1074, v1648); + float32x2_t v1652 = vadd_f32(v1074, v1648); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v1655), 0); + v6[ostride * 23] = vget_lane_s32(vreinterpret_s32_s16(v1667), 0); + int16x4_t v1417 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1407, 15), (int32x2_t){0, 0})); + int16x4_t v1429 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1408, 15), (int32x2_t){0, 0})); + int16x4_t v1661 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1651, 15), (int32x2_t){0, 0})); + int16x4_t v1673 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1652, 15), (int32x2_t){0, 0})); + v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v1417), 0); + v6[ostride * 27] = vget_lane_s32(vreinterpret_s32_s16(v1429), 0); + v6[ostride * 15] = vget_lane_s32(vreinterpret_s32_s16(v1661), 0); + v6[ostride * 31] = vget_lane_s32(vreinterpret_s32_s16(v1673), 0); + v5 += 1 * idist; + v6 += 1 * odist; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cs16_ab_t_gs32(const armral_cmplx_f32_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, + const armral_cmplx_f32_t *restrict w, + int howmany, int idist, int odist, + float dir) { + int64_t v0 = istride; + int64_t v1 = idist; + int64_t v2 = ostride; + int64_t v3 = odist; + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + int32_t *v6 = (int32_t *)y; + const float32x2_t *v7 = (const float32x2_t *)w; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * v1; + int64_t v12 = v10 * v3; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + int64_t v13 = j; + float v1179 = -1.9509032201612819e-01F; + float v1238 = 7.0710678118654757e-01F; + float v1250 = -7.0710678118654746e-01F; + float v1255 = -1.0000000000000000e+00F; + float v1309 = 5.5557023301960229e-01F; + float v1314 = 8.3146961230254524e-01F; + float v1321 = -9.8078528040323043e-01F; + float v1380 = 3.8268343236508984e-01F; + float v1385 = 9.2387953251128674e-01F; + float v1392 = -9.2387953251128685e-01F; + float v1397 = -3.8268343236508967e-01F; + float v1451 = 1.9509032201612833e-01F; + float v1456 = 9.8078528040323043e-01F; + float v1463 = -5.5557023301960218e-01F; + float v1468 = -8.3146961230254524e-01F; + const float32x2_t *v1662 = &v5[v0]; + int32_t *v1900 = &v6[v2]; + int64_t v19 = v0 * 16; + int64_t v34 = v10 * 15; + int64_t v40 = v0 * 8; + int64_t v48 = v10 * 7; + int64_t v54 = v0 * 24; + int64_t v62 = v10 * 23; + int64_t v68 = v0 * 4; + int64_t v82 = v0 * 20; + int64_t v97 = v10 * 3; + int64_t v104 = v10 * 19; + int64_t v110 = v0 * 12; + int64_t v124 = v0 * 28; + int64_t v139 = v10 * 11; + int64_t v146 = v10 * 27; + int64_t v152 = v0 * 2; + int64_t v166 = v0 * 18; + int64_t v188 = v10 * 17; + int64_t v194 = v0 * 10; + int64_t v202 = v10 * 9; + int64_t v208 = v0 * 26; + int64_t v216 = v10 * 25; + int64_t v222 = v0 * 6; + int64_t v236 = v0 * 22; + int64_t 
v251 = v10 * 5; + int64_t v258 = v10 * 21; + int64_t v264 = v0 * 14; + int64_t v272 = v10 * 13; + int64_t v278 = v0 * 30; + int64_t v286 = v10 * 29; + int64_t v306 = v0 * 17; + int64_t v328 = v10 * 16; + int64_t v334 = v0 * 9; + int64_t v342 = v10 * 8; + int64_t v348 = v0 * 25; + int64_t v356 = v10 * 24; + int64_t v362 = v0 * 5; + int64_t v376 = v0 * 21; + int64_t v391 = v10 * 4; + int64_t v398 = v10 * 20; + int64_t v404 = v0 * 13; + int64_t v418 = v0 * 29; + int64_t v433 = v10 * 12; + int64_t v440 = v10 * 28; + int64_t v446 = v0 * 3; + int64_t v460 = v0 * 19; + int64_t v475 = v10 * 2; + int64_t v482 = v10 * 18; + int64_t v488 = v0 * 11; + int64_t v496 = v10 * 10; + int64_t v502 = v0 * 27; + int64_t v510 = v10 * 26; + int64_t v516 = v0 * 7; + int64_t v530 = v0 * 23; + int64_t v545 = v10 * 6; + int64_t v552 = v10 * 22; + int64_t v558 = v0 * 15; + int64_t v572 = v0 * 31; + int64_t v587 = v10 * 14; + int64_t v594 = v10 * 30; + int64_t v595 = v13 * 31; + int64_t v1001 = v2 * 8; + int64_t v1009 = v2 * 16; + int64_t v1017 = v2 * 24; + int64_t v1072 = v2 * 9; + int64_t v1080 = v2 * 17; + int64_t v1088 = v2 * 25; + float v1104 = v4 * v1380; + int64_t v1135 = v2 * 2; + int64_t v1143 = v2 * 10; + int64_t v1151 = v2 * 18; + int64_t v1159 = v2 * 26; + float v1175 = v4 * v1309; + int64_t v1206 = v2 * 3; + int64_t v1214 = v2 * 11; + int64_t v1222 = v2 * 19; + int64_t v1230 = v2 * 27; + float v1258 = v4 * v1255; + int64_t v1277 = v2 * 4; + int64_t v1285 = v2 * 12; + int64_t v1293 = v2 * 20; + int64_t v1301 = v2 * 28; + float v1317 = v4 * v1314; + float v1329 = v4 * v1451; + int64_t v1348 = v2 * 5; + int64_t v1356 = v2 * 13; + int64_t v1364 = v2 * 21; + int64_t v1372 = v2 * 29; + float v1388 = v4 * v1385; + float v1400 = v4 * v1397; + int64_t v1419 = v2 * 6; + int64_t v1427 = v2 * 14; + int64_t v1435 = v2 * 22; + int64_t v1443 = v2 * 30; + float v1459 = v4 * v1456; + float v1471 = v4 * v1468; + int64_t v1490 = v2 * 7; + int64_t v1498 = v2 * 15; + int64_t v1506 = v2 * 23; + int64_t v1514 = v2 * 31; + const float32x2_t *v1809 = &v5[0]; + svint64_t v1810 = svindex_s64(0, v1); + int32_t *v1859 = &v6[0]; + svfloat32_t v1889 = svdup_n_f32(v1456); + svfloat32_t v1930 = svdup_n_f32(v1385); + svfloat32_t v1971 = svdup_n_f32(v1314); + svfloat32_t v1973 = svdup_n_f32(v1179); + svfloat32_t v2012 = svdup_n_f32(v1238); + svfloat32_t v2014 = svdup_n_f32(v1250); + svfloat32_t v2053 = svdup_n_f32(v1309); + svfloat32_t v2055 = svdup_n_f32(v1321); + svfloat32_t v2094 = svdup_n_f32(v1380); + svfloat32_t v2096 = svdup_n_f32(v1392); + svfloat32_t v2135 = svdup_n_f32(v1451); + svfloat32_t v2137 = svdup_n_f32(v1463); + svfloat32_t v2139 = svdup_n_f32(v4); + svint64_t v2174 = svindex_s64(0, v3); + int64_t v36 = v34 + v595; + int64_t v50 = v48 + v595; + int64_t v64 = v62 + v595; + int64_t v99 = v97 + v595; + int64_t v106 = v104 + v595; + int64_t v141 = v139 + v595; + int64_t v148 = v146 + v595; + int64_t v183 = v10 + v595; + int64_t v190 = v188 + v595; + int64_t v204 = v202 + v595; + int64_t v218 = v216 + v595; + int64_t v253 = v251 + v595; + int64_t v260 = v258 + v595; + int64_t v274 = v272 + v595; + int64_t v288 = v286 + v595; + svfloat32_t v324 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v595])); + int64_t v330 = v328 + v595; + int64_t v344 = v342 + v595; + int64_t v358 = v356 + v595; + int64_t v393 = v391 + v595; + int64_t v400 = v398 + v595; + int64_t v435 = v433 + v595; + int64_t v442 = v440 + v595; + int64_t v477 = v475 + v595; + int64_t v484 = v482 + v595; + int64_t v498 = v496 + v595; + int64_t 
v512 = v510 + v595; + int64_t v547 = v545 + v595; + int64_t v554 = v552 + v595; + int64_t v589 = v587 + v595; + int64_t v596 = v594 + v595; + const float32x2_t *v1527 = &v5[v19]; + const float32x2_t *v1536 = &v5[v40]; + const float32x2_t *v1545 = &v5[v54]; + const float32x2_t *v1554 = &v5[v68]; + const float32x2_t *v1563 = &v5[v82]; + const float32x2_t *v1572 = &v5[v110]; + const float32x2_t *v1581 = &v5[v124]; + const float32x2_t *v1590 = &v5[v152]; + const float32x2_t *v1599 = &v5[v166]; + const float32x2_t *v1608 = &v5[v194]; + const float32x2_t *v1617 = &v5[v208]; + const float32x2_t *v1626 = &v5[v222]; + const float32x2_t *v1635 = &v5[v236]; + const float32x2_t *v1644 = &v5[v264]; + const float32x2_t *v1653 = &v5[v278]; + svfloat32_t v1664 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1662), v1810)); + const float32x2_t *v1672 = &v5[v306]; + const float32x2_t *v1682 = &v5[v334]; + const float32x2_t *v1691 = &v5[v348]; + const float32x2_t *v1700 = &v5[v362]; + const float32x2_t *v1709 = &v5[v376]; + const float32x2_t *v1718 = &v5[v404]; + const float32x2_t *v1727 = &v5[v418]; + const float32x2_t *v1736 = &v5[v446]; + const float32x2_t *v1745 = &v5[v460]; + const float32x2_t *v1754 = &v5[v488]; + const float32x2_t *v1763 = &v5[v502]; + const float32x2_t *v1772 = &v5[v516]; + const float32x2_t *v1781 = &v5[v530]; + const float32x2_t *v1790 = &v5[v558]; + const float32x2_t *v1799 = &v5[v572]; + svfloat32_t v1811 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1809), v1810)); + int32_t *v1868 = &v6[v1001]; + int32_t *v1877 = &v6[v1009]; + int32_t *v1886 = &v6[v1017]; + int32_t *v1909 = &v6[v1072]; + int32_t *v1918 = &v6[v1080]; + int32_t *v1927 = &v6[v1088]; + svfloat32_t v1931 = svdup_n_f32(v1104); + int32_t *v1941 = &v6[v1135]; + int32_t *v1950 = &v6[v1143]; + int32_t *v1959 = &v6[v1151]; + int32_t *v1968 = &v6[v1159]; + svfloat32_t v1972 = svdup_n_f32(v1175); + int32_t *v1982 = &v6[v1206]; + int32_t *v1991 = &v6[v1214]; + int32_t *v2000 = &v6[v1222]; + int32_t *v2009 = &v6[v1230]; + svfloat32_t v2015 = svdup_n_f32(v1258); + int32_t *v2023 = &v6[v1277]; + int32_t *v2032 = &v6[v1285]; + int32_t *v2041 = &v6[v1293]; + int32_t *v2050 = &v6[v1301]; + svfloat32_t v2054 = svdup_n_f32(v1317); + svfloat32_t v2056 = svdup_n_f32(v1329); + int32_t *v2064 = &v6[v1348]; + int32_t *v2073 = &v6[v1356]; + int32_t *v2082 = &v6[v1364]; + int32_t *v2091 = &v6[v1372]; + svfloat32_t v2095 = svdup_n_f32(v1388); + svfloat32_t v2097 = svdup_n_f32(v1400); + int32_t *v2105 = &v6[v1419]; + int32_t *v2114 = &v6[v1427]; + int32_t *v2123 = &v6[v1435]; + int32_t *v2132 = &v6[v1443]; + svfloat32_t v2136 = svdup_n_f32(v1459); + svfloat32_t v2138 = svdup_n_f32(v1471); + int32_t *v2146 = &v6[v1490]; + int32_t *v2155 = &v6[v1498]; + int32_t *v2164 = &v6[v1506]; + int32_t *v2173 = &v6[v1514]; + svfloat32_t v37 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v36])); + svfloat32_t v51 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v50])); + svfloat32_t v65 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v64])); + svfloat32_t v100 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v99])); + svfloat32_t v107 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v106])); + svfloat32_t v142 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v141])); + svfloat32_t v149 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v148])); 
+ svfloat32_t v184 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v183])); + svfloat32_t v191 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v190])); + svfloat32_t v205 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v204])); + svfloat32_t v219 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v218])); + svfloat32_t v254 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v253])); + svfloat32_t v261 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v260])); + svfloat32_t v275 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v274])); + svfloat32_t v289 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v288])); + svfloat32_t zero325 = svdup_n_f32(0); + svfloat32_t v325 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero325, v1664, v324, 0), v1664, + v324, 90); + svfloat32_t v331 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v330])); + svfloat32_t v345 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v344])); + svfloat32_t v359 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v358])); + svfloat32_t v394 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v393])); + svfloat32_t v401 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v400])); + svfloat32_t v436 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v435])); + svfloat32_t v443 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v442])); + svfloat32_t v478 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v477])); + svfloat32_t v485 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v484])); + svfloat32_t v499 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v498])); + svfloat32_t v513 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v512])); + svfloat32_t v548 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v547])); + svfloat32_t v555 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v554])); + svfloat32_t v590 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v589])); + svfloat32_t v597 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v7)[v596])); + svfloat32_t v1529 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1527), v1810)); + svfloat32_t v1538 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1536), v1810)); + svfloat32_t v1547 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1545), v1810)); + svfloat32_t v1556 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1554), v1810)); + svfloat32_t v1565 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1563), v1810)); + svfloat32_t v1574 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1572), v1810)); + svfloat32_t v1583 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1581), v1810)); + svfloat32_t v1592 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1590), v1810)); + svfloat32_t v1601 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1599), v1810)); + svfloat32_t v1610 = svreinterpret_f32_f64( + 
svld1_gather_s64index_f64(pred_full, (const double *)(v1608), v1810)); + svfloat32_t v1619 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1617), v1810)); + svfloat32_t v1628 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1626), v1810)); + svfloat32_t v1637 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1635), v1810)); + svfloat32_t v1646 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1644), v1810)); + svfloat32_t v1655 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1653), v1810)); + svfloat32_t v1674 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1672), v1810)); + svfloat32_t v1684 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1682), v1810)); + svfloat32_t v1693 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1691), v1810)); + svfloat32_t v1702 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1700), v1810)); + svfloat32_t v1711 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1709), v1810)); + svfloat32_t v1720 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1718), v1810)); + svfloat32_t v1729 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1727), v1810)); + svfloat32_t v1738 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1736), v1810)); + svfloat32_t v1747 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1745), v1810)); + svfloat32_t v1756 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1754), v1810)); + svfloat32_t v1765 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1763), v1810)); + svfloat32_t v1774 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1772), v1810)); + svfloat32_t v1783 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1781), v1810)); + svfloat32_t v1792 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1790), v1810)); + svfloat32_t v1801 = svreinterpret_f32_f64( + svld1_gather_s64index_f64(pred_full, (const double *)(v1799), v1810)); + svfloat32_t zero38 = svdup_n_f32(0); + svfloat32_t v38 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero38, v1529, v37, 0), + v1529, v37, 90); + svfloat32_t zero52 = svdup_n_f32(0); + svfloat32_t v52 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero52, v1538, v51, 0), + v1538, v51, 90); + svfloat32_t zero66 = svdup_n_f32(0); + svfloat32_t v66 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero66, v1547, v65, 0), + v1547, v65, 90); + svfloat32_t zero101 = svdup_n_f32(0); + svfloat32_t v101 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero101, v1556, v100, 0), v1556, + v100, 90); + svfloat32_t zero108 = svdup_n_f32(0); + svfloat32_t v108 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero108, v1565, v107, 0), v1565, + v107, 90); + svfloat32_t zero143 = svdup_n_f32(0); + svfloat32_t v143 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero143, v1574, v142, 0), v1574, + v142, 90); + svfloat32_t zero150 = svdup_n_f32(0); + svfloat32_t v150 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero150, v1583, v149, 0), v1583, + v149, 90); + svfloat32_t zero185 = 
svdup_n_f32(0); + svfloat32_t v185 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero185, v1592, v184, 0), v1592, + v184, 90); + svfloat32_t zero192 = svdup_n_f32(0); + svfloat32_t v192 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero192, v1601, v191, 0), v1601, + v191, 90); + svfloat32_t zero206 = svdup_n_f32(0); + svfloat32_t v206 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero206, v1610, v205, 0), v1610, + v205, 90); + svfloat32_t zero220 = svdup_n_f32(0); + svfloat32_t v220 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero220, v1619, v219, 0), v1619, + v219, 90); + svfloat32_t zero255 = svdup_n_f32(0); + svfloat32_t v255 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero255, v1628, v254, 0), v1628, + v254, 90); + svfloat32_t zero262 = svdup_n_f32(0); + svfloat32_t v262 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero262, v1637, v261, 0), v1637, + v261, 90); + svfloat32_t zero276 = svdup_n_f32(0); + svfloat32_t v276 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero276, v1646, v275, 0), v1646, + v275, 90); + svfloat32_t zero290 = svdup_n_f32(0); + svfloat32_t v290 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero290, v1655, v289, 0), v1655, + v289, 90); + svfloat32_t zero332 = svdup_n_f32(0); + svfloat32_t v332 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero332, v1674, v331, 0), v1674, + v331, 90); + svfloat32_t zero346 = svdup_n_f32(0); + svfloat32_t v346 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero346, v1684, v345, 0), v1684, + v345, 90); + svfloat32_t zero360 = svdup_n_f32(0); + svfloat32_t v360 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero360, v1693, v359, 0), v1693, + v359, 90); + svfloat32_t zero395 = svdup_n_f32(0); + svfloat32_t v395 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero395, v1702, v394, 0), v1702, + v394, 90); + svfloat32_t zero402 = svdup_n_f32(0); + svfloat32_t v402 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero402, v1711, v401, 0), v1711, + v401, 90); + svfloat32_t zero437 = svdup_n_f32(0); + svfloat32_t v437 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero437, v1720, v436, 0), v1720, + v436, 90); + svfloat32_t zero444 = svdup_n_f32(0); + svfloat32_t v444 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero444, v1729, v443, 0), v1729, + v443, 90); + svfloat32_t zero479 = svdup_n_f32(0); + svfloat32_t v479 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero479, v1738, v478, 0), v1738, + v478, 90); + svfloat32_t zero486 = svdup_n_f32(0); + svfloat32_t v486 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero486, v1747, v485, 0), v1747, + v485, 90); + svfloat32_t zero500 = svdup_n_f32(0); + svfloat32_t v500 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero500, v1756, v499, 0), v1756, + v499, 90); + svfloat32_t zero514 = svdup_n_f32(0); + svfloat32_t v514 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero514, v1765, v513, 0), v1765, + v513, 90); + svfloat32_t zero549 = svdup_n_f32(0); + svfloat32_t v549 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero549, v1774, v548, 0), v1774, + v548, 90); + svfloat32_t zero556 = svdup_n_f32(0); + svfloat32_t v556 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero556, v1783, v555, 0), v1783, + v555, 90); + svfloat32_t zero591 = svdup_n_f32(0); + svfloat32_t v591 = svcmla_f32_x( + pred_full, svcmla_f32_x(pred_full, zero591, v1792, v590, 0), v1792, + v590, 90); + svfloat32_t zero598 = svdup_n_f32(0); + svfloat32_t v598 = svcmla_f32_x( + pred_full, 
svcmla_f32_x(pred_full, zero598, v1801, v597, 0), v1801, + v597, 90); + svfloat32_t v606 = svadd_f32_x(svptrue_b32(), v1811, v38); + svfloat32_t v607 = svsub_f32_x(svptrue_b32(), v1811, v38); + svfloat32_t v608 = svadd_f32_x(svptrue_b32(), v52, v66); + svfloat32_t v609 = svsub_f32_x(svptrue_b32(), v52, v66); + svfloat32_t v621 = svadd_f32_x(svptrue_b32(), v101, v108); + svfloat32_t v622 = svsub_f32_x(svptrue_b32(), v101, v108); + svfloat32_t v623 = svadd_f32_x(svptrue_b32(), v143, v150); + svfloat32_t v624 = svsub_f32_x(svptrue_b32(), v143, v150); + svfloat32_t v677 = svadd_f32_x(svptrue_b32(), v185, v192); + svfloat32_t v678 = svsub_f32_x(svptrue_b32(), v185, v192); + svfloat32_t v679 = svadd_f32_x(svptrue_b32(), v206, v220); + svfloat32_t v680 = svsub_f32_x(svptrue_b32(), v206, v220); + svfloat32_t v692 = svadd_f32_x(svptrue_b32(), v255, v262); + svfloat32_t v693 = svsub_f32_x(svptrue_b32(), v255, v262); + svfloat32_t v694 = svadd_f32_x(svptrue_b32(), v276, v290); + svfloat32_t v695 = svsub_f32_x(svptrue_b32(), v276, v290); + svfloat32_t v837 = svadd_f32_x(svptrue_b32(), v325, v332); + svfloat32_t v838 = svsub_f32_x(svptrue_b32(), v325, v332); + svfloat32_t v839 = svadd_f32_x(svptrue_b32(), v346, v360); + svfloat32_t v840 = svsub_f32_x(svptrue_b32(), v346, v360); + svfloat32_t v852 = svadd_f32_x(svptrue_b32(), v395, v402); + svfloat32_t v853 = svsub_f32_x(svptrue_b32(), v395, v402); + svfloat32_t v854 = svadd_f32_x(svptrue_b32(), v437, v444); + svfloat32_t v855 = svsub_f32_x(svptrue_b32(), v437, v444); + svfloat32_t v908 = svadd_f32_x(svptrue_b32(), v479, v486); + svfloat32_t v909 = svsub_f32_x(svptrue_b32(), v479, v486); + svfloat32_t v910 = svadd_f32_x(svptrue_b32(), v500, v514); + svfloat32_t v911 = svsub_f32_x(svptrue_b32(), v500, v514); + svfloat32_t v923 = svadd_f32_x(svptrue_b32(), v549, v556); + svfloat32_t v924 = svsub_f32_x(svptrue_b32(), v549, v556); + svfloat32_t v925 = svadd_f32_x(svptrue_b32(), v591, v598); + svfloat32_t v926 = svsub_f32_x(svptrue_b32(), v591, v598); + svfloat32_t zero616 = svdup_n_f32(0); + svfloat32_t v616 = svcmla_f32_x(pred_full, zero616, v2015, v609, 90); + svfloat32_t v617 = svadd_f32_x(svptrue_b32(), v606, v608); + svfloat32_t v618 = svsub_f32_x(svptrue_b32(), v606, v608); + svfloat32_t v625 = svadd_f32_x(svptrue_b32(), v621, v623); + svfloat32_t v626 = svsub_f32_x(svptrue_b32(), v621, v623); + svfloat32_t v642 = svmul_f32_x(svptrue_b32(), v622, v2012); + svfloat32_t v654 = svmul_f32_x(svptrue_b32(), v624, v2014); + svfloat32_t zero687 = svdup_n_f32(0); + svfloat32_t v687 = svcmla_f32_x(pred_full, zero687, v2015, v680, 90); + svfloat32_t v688 = svadd_f32_x(svptrue_b32(), v677, v679); + svfloat32_t v689 = svsub_f32_x(svptrue_b32(), v677, v679); + svfloat32_t zero702 = svdup_n_f32(0); + svfloat32_t v702 = svcmla_f32_x(pred_full, zero702, v2015, v695, 90); + svfloat32_t v703 = svadd_f32_x(svptrue_b32(), v692, v694); + svfloat32_t v704 = svsub_f32_x(svptrue_b32(), v692, v694); + svfloat32_t zero847 = svdup_n_f32(0); + svfloat32_t v847 = svcmla_f32_x(pred_full, zero847, v2015, v840, 90); + svfloat32_t v848 = svadd_f32_x(svptrue_b32(), v837, v839); + svfloat32_t v849 = svsub_f32_x(svptrue_b32(), v837, v839); + svfloat32_t v856 = svadd_f32_x(svptrue_b32(), v852, v854); + svfloat32_t v857 = svsub_f32_x(svptrue_b32(), v852, v854); + svfloat32_t v873 = svmul_f32_x(svptrue_b32(), v853, v2012); + svfloat32_t v885 = svmul_f32_x(svptrue_b32(), v855, v2014); + svfloat32_t zero918 = svdup_n_f32(0); + svfloat32_t v918 = svcmla_f32_x(pred_full, zero918, v2015, v911, 
90); + svfloat32_t v919 = svadd_f32_x(svptrue_b32(), v908, v910); + svfloat32_t v920 = svsub_f32_x(svptrue_b32(), v908, v910); + svfloat32_t v927 = svadd_f32_x(svptrue_b32(), v923, v925); + svfloat32_t v928 = svsub_f32_x(svptrue_b32(), v923, v925); + svfloat32_t v944 = svmul_f32_x(svptrue_b32(), v924, v2012); + svfloat32_t v956 = svmul_f32_x(svptrue_b32(), v926, v2014); + svfloat32_t v619 = svsub_f32_x(svptrue_b32(), v607, v616); + svfloat32_t v620 = svadd_f32_x(svptrue_b32(), v607, v616); + svfloat32_t zero633 = svdup_n_f32(0); + svfloat32_t v633 = svcmla_f32_x(pred_full, zero633, v2015, v626, 90); + svfloat32_t v634 = svadd_f32_x(svptrue_b32(), v617, v625); + svfloat32_t v635 = svsub_f32_x(svptrue_b32(), v617, v625); + svfloat32_t v690 = svsub_f32_x(svptrue_b32(), v678, v687); + svfloat32_t v691 = svadd_f32_x(svptrue_b32(), v678, v687); + svfloat32_t v705 = svsub_f32_x(svptrue_b32(), v693, v702); + svfloat32_t v706 = svadd_f32_x(svptrue_b32(), v693, v702); + svfloat32_t v707 = svadd_f32_x(svptrue_b32(), v688, v703); + svfloat32_t v708 = svsub_f32_x(svptrue_b32(), v688, v703); + svfloat32_t v763 = svmul_f32_x(svptrue_b32(), v689, v2012); + svfloat32_t v775 = svmul_f32_x(svptrue_b32(), v704, v2014); + svfloat32_t v850 = svsub_f32_x(svptrue_b32(), v838, v847); + svfloat32_t v851 = svadd_f32_x(svptrue_b32(), v838, v847); + svfloat32_t zero864 = svdup_n_f32(0); + svfloat32_t v864 = svcmla_f32_x(pred_full, zero864, v2015, v857, 90); + svfloat32_t v865 = svadd_f32_x(svptrue_b32(), v848, v856); + svfloat32_t v866 = svsub_f32_x(svptrue_b32(), v848, v856); + svfloat32_t v921 = svsub_f32_x(svptrue_b32(), v909, v918); + svfloat32_t v922 = svadd_f32_x(svptrue_b32(), v909, v918); + svfloat32_t zero935 = svdup_n_f32(0); + svfloat32_t v935 = svcmla_f32_x(pred_full, zero935, v2015, v928, 90); + svfloat32_t v936 = svadd_f32_x(svptrue_b32(), v919, v927); + svfloat32_t v937 = svsub_f32_x(svptrue_b32(), v919, v927); + svfloat32_t v636 = svsub_f32_x(svptrue_b32(), v618, v633); + svfloat32_t v637 = svadd_f32_x(svptrue_b32(), v618, v633); + svfloat32_t v662 = svcmla_f32_x(pred_full, v642, v2139, v642, 90); + svfloat32_t v663 = svcmla_f32_x(pred_full, v654, v2015, v654, 90); + svfloat32_t zero715 = svdup_n_f32(0); + svfloat32_t v715 = svcmla_f32_x(pred_full, zero715, v2015, v708, 90); + svfloat32_t v716 = svadd_f32_x(svptrue_b32(), v634, v707); + svfloat32_t v717 = svsub_f32_x(svptrue_b32(), v634, v707); + svfloat32_t v724 = svmul_f32_x(svptrue_b32(), v690, v1930); + svfloat32_t v736 = svmul_f32_x(svptrue_b32(), v705, v2094); + svfloat32_t v802 = svmul_f32_x(svptrue_b32(), v691, v2094); + svfloat32_t v814 = svmul_f32_x(svptrue_b32(), v706, v2096); + svfloat32_t v867 = svsub_f32_x(svptrue_b32(), v849, v864); + svfloat32_t v868 = svadd_f32_x(svptrue_b32(), v849, v864); + svfloat32_t v893 = svcmla_f32_x(pred_full, v873, v2139, v873, 90); + svfloat32_t v894 = svcmla_f32_x(pred_full, v885, v2015, v885, 90); + svfloat32_t v938 = svsub_f32_x(svptrue_b32(), v920, v935); + svfloat32_t v939 = svadd_f32_x(svptrue_b32(), v920, v935); + svfloat32_t v964 = svcmla_f32_x(pred_full, v944, v2139, v944, 90); + svfloat32_t v965 = svcmla_f32_x(pred_full, v956, v2015, v956, 90); + svfloat32_t v979 = svadd_f32_x(svptrue_b32(), v865, v936); + svfloat32_t v980 = svsub_f32_x(svptrue_b32(), v865, v936); + svfloat32_t v1241 = svmul_f32_x(svptrue_b32(), v866, v2012); + svfloat32_t v1253 = svmul_f32_x(svptrue_b32(), v937, v2014); + svfloat32_t v664 = svadd_f32_x(svptrue_b32(), v662, v663); + svfloat32_t v665 = svsub_f32_x(svptrue_b32(), 
v663, v662); + svfloat32_t v718 = svsub_f32_x(svptrue_b32(), v635, v715); + svfloat32_t v719 = svadd_f32_x(svptrue_b32(), v635, v715); + svfloat32_t v744 = svcmla_f32_x(pred_full, v724, v1931, v690, 90); + svfloat32_t v745 = svcmla_f32_x(pred_full, v736, v2095, v705, 90); + svfloat32_t v783 = svcmla_f32_x(pred_full, v763, v2139, v763, 90); + svfloat32_t v784 = svcmla_f32_x(pred_full, v775, v2015, v775, 90); + svfloat32_t v822 = svcmla_f32_x(pred_full, v802, v2095, v691, 90); + svfloat32_t v823 = svcmla_f32_x(pred_full, v814, v2097, v706, 90); + svfloat32_t v895 = svadd_f32_x(svptrue_b32(), v893, v894); + svfloat32_t v896 = svsub_f32_x(svptrue_b32(), v894, v893); + svfloat32_t v966 = svadd_f32_x(svptrue_b32(), v964, v965); + svfloat32_t v967 = svsub_f32_x(svptrue_b32(), v965, v964); + svfloat32_t zero987 = svdup_n_f32(0); + svfloat32_t v987 = svcmla_f32_x(pred_full, zero987, v2015, v980, 90); + svfloat32_t v988 = svadd_f32_x(svptrue_b32(), v716, v979); + svfloat32_t v989 = svsub_f32_x(svptrue_b32(), v716, v979); + svfloat32_t v1099 = svmul_f32_x(svptrue_b32(), v867, v1930); + svfloat32_t v1111 = svmul_f32_x(svptrue_b32(), v938, v2094); + svfloat32_t v1383 = svmul_f32_x(svptrue_b32(), v868, v2094); + svfloat32_t v1395 = svmul_f32_x(svptrue_b32(), v939, v2096); + svfloat32_t zero672 = svdup_n_f32(0); + svfloat32_t v672 = svcmla_f32_x(pred_full, zero672, v2139, v665, 90); + svfloat32_t v673 = svadd_f32_x(svptrue_b32(), v619, v664); + svfloat32_t v674 = svsub_f32_x(svptrue_b32(), v619, v664); + svfloat32_t v746 = svadd_f32_x(svptrue_b32(), v744, v745); + svfloat32_t v747 = svsub_f32_x(svptrue_b32(), v745, v744); + svfloat32_t v785 = svadd_f32_x(svptrue_b32(), v783, v784); + svfloat32_t v786 = svsub_f32_x(svptrue_b32(), v784, v783); + svfloat32_t v824 = svadd_f32_x(svptrue_b32(), v822, v823); + svfloat32_t v825 = svsub_f32_x(svptrue_b32(), v823, v822); + svfloat32_t zero903 = svdup_n_f32(0); + svfloat32_t v903 = svcmla_f32_x(pred_full, zero903, v2139, v896, 90); + svfloat32_t v904 = svadd_f32_x(svptrue_b32(), v850, v895); + svfloat32_t v905 = svsub_f32_x(svptrue_b32(), v850, v895); + svfloat32_t zero974 = svdup_n_f32(0); + svfloat32_t v974 = svcmla_f32_x(pred_full, zero974, v2139, v967, 90); + svfloat32_t v975 = svadd_f32_x(svptrue_b32(), v921, v966); + svfloat32_t v976 = svsub_f32_x(svptrue_b32(), v921, v966); + svfloat32_t v990 = svsub_f32_x(svptrue_b32(), v717, v987); + svfloat32_t v991 = svadd_f32_x(svptrue_b32(), v717, v987); + svint16_t v994 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v988, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v1010 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v989, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v1119 = svcmla_f32_x(pred_full, v1099, v1931, v867, 90); + svfloat32_t v1120 = svcmla_f32_x(pred_full, v1111, v2095, v938, 90); + svfloat32_t v1261 = svcmla_f32_x(pred_full, v1241, v2139, v1241, 90); + svfloat32_t v1262 = svcmla_f32_x(pred_full, v1253, v2015, v1253, 90); + svfloat32_t v1403 = svcmla_f32_x(pred_full, v1383, v2095, v868, 90); + svfloat32_t v1404 = svcmla_f32_x(pred_full, v1395, v2097, v939, 90); + svfloat32_t v675 = svsub_f32_x(svptrue_b32(), v620, v672); + svfloat32_t v676 = svadd_f32_x(svptrue_b32(), v620, v672); + svfloat32_t zero754 = svdup_n_f32(0); + svfloat32_t v754 = 
svcmla_f32_x(pred_full, zero754, v2139, v747, 90); + svfloat32_t v755 = svadd_f32_x(svptrue_b32(), v673, v746); + svfloat32_t v756 = svsub_f32_x(svptrue_b32(), v673, v746); + svfloat32_t zero793 = svdup_n_f32(0); + svfloat32_t v793 = svcmla_f32_x(pred_full, zero793, v2139, v786, 90); + svfloat32_t v794 = svadd_f32_x(svptrue_b32(), v636, v785); + svfloat32_t v795 = svsub_f32_x(svptrue_b32(), v636, v785); + svfloat32_t zero832 = svdup_n_f32(0); + svfloat32_t v832 = svcmla_f32_x(pred_full, zero832, v2139, v825, 90); + svfloat32_t v906 = svsub_f32_x(svptrue_b32(), v851, v903); + svfloat32_t v907 = svadd_f32_x(svptrue_b32(), v851, v903); + svfloat32_t v977 = svsub_f32_x(svptrue_b32(), v922, v974); + svfloat32_t v978 = svadd_f32_x(svptrue_b32(), v922, v974); + svint16_t v1002 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v990, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v1018 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v991, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v1028 = svmul_f32_x(svptrue_b32(), v904, v1889); + svfloat32_t v1040 = svmul_f32_x(svptrue_b32(), v975, v1971); + svfloat32_t v1121 = svadd_f32_x(svptrue_b32(), v1119, v1120); + svfloat32_t v1122 = svsub_f32_x(svptrue_b32(), v1120, v1119); + svfloat32_t v1263 = svadd_f32_x(svptrue_b32(), v1261, v1262); + svfloat32_t v1264 = svsub_f32_x(svptrue_b32(), v1262, v1261); + svfloat32_t v1312 = svmul_f32_x(svptrue_b32(), v905, v2053); + svfloat32_t v1324 = svmul_f32_x(svptrue_b32(), v976, v2055); + svfloat32_t v1405 = svadd_f32_x(svptrue_b32(), v1403, v1404); + svfloat32_t v1406 = svsub_f32_x(svptrue_b32(), v1404, v1403); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1859), v2174, + svreinterpret_u64_s16(v994)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1877), v2174, + svreinterpret_u64_s16(v1010)); + svfloat32_t v757 = svsub_f32_x(svptrue_b32(), v674, v754); + svfloat32_t v758 = svadd_f32_x(svptrue_b32(), v674, v754); + svfloat32_t v796 = svsub_f32_x(svptrue_b32(), v637, v793); + svfloat32_t v797 = svadd_f32_x(svptrue_b32(), v637, v793); + svfloat32_t v833 = svadd_f32_x(svptrue_b32(), v675, v824); + svfloat32_t v834 = svsub_f32_x(svptrue_b32(), v675, v824); + svfloat32_t v835 = svsub_f32_x(svptrue_b32(), v676, v832); + svfloat32_t v836 = svadd_f32_x(svptrue_b32(), v676, v832); + svfloat32_t v1048 = svcmla_f32_x(pred_full, v1028, v2056, v904, 90); + svfloat32_t v1049 = svcmla_f32_x(pred_full, v1040, v1972, v975, 90); + svfloat32_t zero1129 = svdup_n_f32(0); + svfloat32_t v1129 = svcmla_f32_x(pred_full, zero1129, v2139, v1122, 90); + svfloat32_t v1130 = svadd_f32_x(svptrue_b32(), v794, v1121); + svfloat32_t v1131 = svsub_f32_x(svptrue_b32(), v794, v1121); + svfloat32_t v1170 = svmul_f32_x(svptrue_b32(), v906, v1971); + svfloat32_t v1182 = svmul_f32_x(svptrue_b32(), v977, v1973); + svfloat32_t zero1271 = svdup_n_f32(0); + svfloat32_t v1271 = svcmla_f32_x(pred_full, zero1271, v2139, v1264, 90); + svfloat32_t v1272 = svadd_f32_x(svptrue_b32(), v718, v1263); + svfloat32_t v1273 = svsub_f32_x(svptrue_b32(), v718, v1263); + svfloat32_t v1332 = svcmla_f32_x(pred_full, v1312, v2054, v905, 90); + svfloat32_t v1333 = svcmla_f32_x(pred_full, v1324, v2056, v976, 90); + svfloat32_t zero1413 = svdup_n_f32(0); + svfloat32_t v1413 = svcmla_f32_x(pred_full, zero1413, v2139, 
v1406, 90); + svfloat32_t v1454 = svmul_f32_x(svptrue_b32(), v907, v2135); + svfloat32_t v1466 = svmul_f32_x(svptrue_b32(), v978, v2137); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1868), v2174, + svreinterpret_u64_s16(v1002)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1886), v2174, + svreinterpret_u64_s16(v1018)); + svfloat32_t v1050 = svadd_f32_x(svptrue_b32(), v1048, v1049); + svfloat32_t v1051 = svsub_f32_x(svptrue_b32(), v1049, v1048); + svfloat32_t v1132 = svsub_f32_x(svptrue_b32(), v795, v1129); + svfloat32_t v1133 = svadd_f32_x(svptrue_b32(), v795, v1129); + svint16_t v1136 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1130, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svint16_t v1152 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1131, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svfloat32_t v1190 = svcmla_f32_x(pred_full, v1170, v1972, v906, 90); + svfloat32_t v1191 = svcmla_f32_x(pred_full, v1182, v2136, v977, 90); + svfloat32_t v1274 = svsub_f32_x(svptrue_b32(), v719, v1271); + svfloat32_t v1275 = svadd_f32_x(svptrue_b32(), v719, v1271); + svint16_t v1278 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1272, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svint16_t v1294 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1273, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svfloat32_t v1334 = svadd_f32_x(svptrue_b32(), v1332, v1333); + svfloat32_t v1335 = svsub_f32_x(svptrue_b32(), v1333, v1332); + svfloat32_t v1414 = svadd_f32_x(svptrue_b32(), v796, v1405); + svfloat32_t v1415 = svsub_f32_x(svptrue_b32(), v796, v1405); + svfloat32_t v1416 = svsub_f32_x(svptrue_b32(), v797, v1413); + svfloat32_t v1417 = svadd_f32_x(svptrue_b32(), v797, v1413); + svfloat32_t v1474 = svcmla_f32_x(pred_full, v1454, v2136, v907, 90); + svfloat32_t v1475 = svcmla_f32_x(pred_full, v1466, v2138, v978, 90); + svfloat32_t zero1058 = svdup_n_f32(0); + svfloat32_t v1058 = svcmla_f32_x(pred_full, zero1058, v2139, v1051, 90); + svfloat32_t v1059 = svadd_f32_x(svptrue_b32(), v755, v1050); + svfloat32_t v1060 = svsub_f32_x(svptrue_b32(), v755, v1050); + svint16_t v1144 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1132, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svint16_t v1160 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1133, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svfloat32_t v1192 = svadd_f32_x(svptrue_b32(), v1190, v1191); + svfloat32_t v1193 = svsub_f32_x(svptrue_b32(), v1191, v1190); + svint16_t v1286 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1274, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svint16_t v1302 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1275, (float)(1ULL << 31ULL)))), + 
svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svfloat32_t zero1342 = svdup_n_f32(0); + svfloat32_t v1342 = svcmla_f32_x(pred_full, zero1342, v2139, v1335, 90); + svfloat32_t v1343 = svadd_f32_x(svptrue_b32(), v757, v1334); + svfloat32_t v1344 = svsub_f32_x(svptrue_b32(), v757, v1334); + svint16_t v1420 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1414, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svint16_t v1428 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1416, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svint16_t v1436 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1415, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svint16_t v1444 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1417, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svfloat32_t v1476 = svadd_f32_x(svptrue_b32(), v1474, v1475); + svfloat32_t v1477 = svsub_f32_x(svptrue_b32(), v1475, v1474); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1941), v2174, + svreinterpret_u64_s16(v1136)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1959), v2174, + svreinterpret_u64_s16(v1152)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v2023), v2174, + svreinterpret_u64_s16(v1278)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v2041), v2174, + svreinterpret_u64_s16(v1294)); + svfloat32_t v1061 = svsub_f32_x(svptrue_b32(), v756, v1058); + svfloat32_t v1062 = svadd_f32_x(svptrue_b32(), v756, v1058); + svint16_t v1065 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1059, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svint16_t v1081 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1060, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svfloat32_t zero1200 = svdup_n_f32(0); + svfloat32_t v1200 = svcmla_f32_x(pred_full, zero1200, v2139, v1193, 90); + svfloat32_t v1201 = svadd_f32_x(svptrue_b32(), v833, v1192); + svfloat32_t v1202 = svsub_f32_x(svptrue_b32(), v833, v1192); + svfloat32_t v1345 = svsub_f32_x(svptrue_b32(), v758, v1342); + svfloat32_t v1346 = svadd_f32_x(svptrue_b32(), v758, v1342); + svint16_t v1349 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1343, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svint16_t v1365 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1344, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svfloat32_t zero1484 = svdup_n_f32(0); + svfloat32_t v1484 = svcmla_f32_x(pred_full, zero1484, v2139, v1477, 90); + svfloat32_t v1485 = svadd_f32_x(svptrue_b32(), v835, v1476); + svfloat32_t v1486 = svsub_f32_x(svptrue_b32(), v835, v1476); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1950), v2174, + 
svreinterpret_u64_s16(v1144)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1968), v2174, + svreinterpret_u64_s16(v1160)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v2032), v2174, + svreinterpret_u64_s16(v1286)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v2050), v2174, + svreinterpret_u64_s16(v1302)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v2105), v2174, + svreinterpret_u64_s16(v1420)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v2114), v2174, + svreinterpret_u64_s16(v1428)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v2123), v2174, + svreinterpret_u64_s16(v1436)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v2132), v2174, + svreinterpret_u64_s16(v1444)); + svint16_t v1073 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1061, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svint16_t v1089 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1062, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svfloat32_t v1203 = svsub_f32_x(svptrue_b32(), v834, v1200); + svfloat32_t v1204 = svadd_f32_x(svptrue_b32(), v834, v1200); + svint16_t v1207 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1201, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svint16_t v1223 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1202, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svint16_t v1357 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1345, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svint16_t v1373 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1346, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svfloat32_t v1487 = svsub_f32_x(svptrue_b32(), v836, v1484); + svfloat32_t v1488 = svadd_f32_x(svptrue_b32(), v836, v1484); + svint16_t v1491 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1485, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svint16_t v1507 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1486, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1900), v2174, + svreinterpret_u64_s16(v1065)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1918), v2174, + svreinterpret_u64_s16(v1081)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v2064), v2174, + svreinterpret_u64_s16(v1349)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v2082), v2174, + svreinterpret_u64_s16(v1365)); + svint16_t v1215 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1203, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svint16_t v1231 = + 
svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1204, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svint16_t v1499 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1487, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svint16_t v1515 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1488, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1909), v2174, + svreinterpret_u64_s16(v1073)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1927), v2174, + svreinterpret_u64_s16(v1089)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1982), v2174, + svreinterpret_u64_s16(v1207)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v2000), v2174, + svreinterpret_u64_s16(v1223)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v2073), v2174, + svreinterpret_u64_s16(v1357)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v2091), v2174, + svreinterpret_u64_s16(v1373)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v2146), v2174, + svreinterpret_u64_s16(v1491)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v2164), v2174, + svreinterpret_u64_s16(v1507)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v1991), v2174, + svreinterpret_u64_s16(v1215)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v2009), v2174, + svreinterpret_u64_s16(v1231)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v2155), v2174, + svreinterpret_u64_s16(v1499)); + svst1w_scatter_s64index_u64(pred_full, (unsigned *)(v2173), v2174, + svreinterpret_u64_s16(v1515)); + v5 += v11; + v6 += v12; + } +} +#endif diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ab_t_gs.h b/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ab_t_gs.h new file mode 100644 index 0000000000000000000000000000000000000000..e4103e8dfb48eefb2755d51448c9977f3c5920bd --- /dev/null +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ab_t_gs.h @@ -0,0 +1,50 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause +*/ +#pragma once + +#include "armral.h" +#include "fft_helper.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef void(cf32_cf32_cs16_ab_t_gs_fft_t)(const armral_cmplx_f32_t *x, + armral_cmplx_int16_t *y, int istride, + int ostride, + const armral_cmplx_f32_t *w, + int howmany, int idist, int odist, + float dir); + +cf32_cf32_cs16_ab_t_gs_fft_t armral_fft_cf32_cf32_cs16_ab_t_gs2; +cf32_cf32_cs16_ab_t_gs_fft_t armral_fft_cf32_cf32_cs16_ab_t_gs3; +cf32_cf32_cs16_ab_t_gs_fft_t armral_fft_cf32_cf32_cs16_ab_t_gs4; +cf32_cf32_cs16_ab_t_gs_fft_t armral_fft_cf32_cf32_cs16_ab_t_gs5; +cf32_cf32_cs16_ab_t_gs_fft_t armral_fft_cf32_cf32_cs16_ab_t_gs6; +cf32_cf32_cs16_ab_t_gs_fft_t armral_fft_cf32_cf32_cs16_ab_t_gs7; +cf32_cf32_cs16_ab_t_gs_fft_t armral_fft_cf32_cf32_cs16_ab_t_gs8; +cf32_cf32_cs16_ab_t_gs_fft_t armral_fft_cf32_cf32_cs16_ab_t_gs9; +cf32_cf32_cs16_ab_t_gs_fft_t armral_fft_cf32_cf32_cs16_ab_t_gs10; +cf32_cf32_cs16_ab_t_gs_fft_t armral_fft_cf32_cf32_cs16_ab_t_gs11; +cf32_cf32_cs16_ab_t_gs_fft_t armral_fft_cf32_cf32_cs16_ab_t_gs12; +cf32_cf32_cs16_ab_t_gs_fft_t armral_fft_cf32_cf32_cs16_ab_t_gs13; +cf32_cf32_cs16_ab_t_gs_fft_t 
armral_fft_cf32_cf32_cs16_ab_t_gs14; +cf32_cf32_cs16_ab_t_gs_fft_t armral_fft_cf32_cf32_cs16_ab_t_gs15; +cf32_cf32_cs16_ab_t_gs_fft_t armral_fft_cf32_cf32_cs16_ab_t_gs16; +cf32_cf32_cs16_ab_t_gs_fft_t armral_fft_cf32_cf32_cs16_ab_t_gs17; +cf32_cf32_cs16_ab_t_gs_fft_t armral_fft_cf32_cf32_cs16_ab_t_gs18; +cf32_cf32_cs16_ab_t_gs_fft_t armral_fft_cf32_cf32_cs16_ab_t_gs19; +cf32_cf32_cs16_ab_t_gs_fft_t armral_fft_cf32_cf32_cs16_ab_t_gs20; +cf32_cf32_cs16_ab_t_gs_fft_t armral_fft_cf32_cf32_cs16_ab_t_gs21; +cf32_cf32_cs16_ab_t_gs_fft_t armral_fft_cf32_cf32_cs16_ab_t_gs22; +cf32_cf32_cs16_ab_t_gs_fft_t armral_fft_cf32_cf32_cs16_ab_t_gs24; +cf32_cf32_cs16_ab_t_gs_fft_t armral_fft_cf32_cf32_cs16_ab_t_gs25; +cf32_cf32_cs16_ab_t_gs_fft_t armral_fft_cf32_cf32_cs16_ab_t_gs32; + +#ifdef __cplusplus +} // extern "C" +#endif \ No newline at end of file
diff --git a/src/LowerPHY/FFT/fft_cf32_kernel_lookup.c b/src/LowerPHY/FFT/fft_cf32_kernel_lookup.c index 2520ecdc1d8f5da4569870f6d46246af7927c065..9be118b34f2ccaff27266e4d04d754b4f41f76b8 100644 --- a/src/LowerPHY/FFT/fft_cf32_kernel_lookup.c +++ b/src/LowerPHY/FFT/fft_cf32_kernel_lookup.c @@ -110,15 +110,15 @@ static cf32_cf32_cf32_ac_n_gu_fft_t NULL, NULL, NULL, + armral_fft_cf32_cf32_cf32_ac_n_gu7, NULL, + armral_fft_cf32_cf32_cf32_ac_n_gu9, NULL, - NULL, - NULL, - NULL, + armral_fft_cf32_cf32_cf32_ac_n_gu11, NULL, armral_fft_cf32_cf32_cf32_ac_n_gu13, armral_fft_cf32_cf32_cf32_ac_n_gu14, - NULL, + armral_fft_cf32_cf32_cf32_ac_n_gu15, armral_fft_cf32_cf32_cf32_ac_n_gu16, armral_fft_cf32_cf32_cf32_ac_n_gu17, armral_fft_cf32_cf32_cf32_ac_n_gu18, @@ -225,7 +225,7 @@ static cf32_cf32_cf32_ab_t_gs_fft_t NULL, NULL, NULL, - NULL, + armral_fft_cf32_cf32_cf32_ab_t_gs32, NULL, NULL, NULL,
diff --git a/src/LowerPHY/FFT/fft_cs16.cpp b/src/LowerPHY/FFT/fft_cs16.cpp index 9728d7b4e4a451f23db58660448067ff4e7821b5..d442dbb5d1c29d7b14029554c605760eaf297062 100644 --- a/src/LowerPHY/FFT/fft_cs16.cpp +++ b/src/LowerPHY/FFT/fft_cs16.cpp @@ -12,15 +12,22 @@ extern "C" { armral_status armral_fft_create_plan_cs16(armral_fft_plan_t **p, int n, armral_fft_direction_t dir) { - return armral::fft::create_plan(p, n, dir, true); + return armral::fft::create_plan_1d(p, n, dir, true); +} + +armral_status armral_fft_create_2d_plan_cs16(armral_fft_plan_t **p, int n0, + int n1, + armral_fft_direction_t dir) { + return armral::fft::create_plan_2d(p, n0, n1, dir, true); } armral_status armral_fft_execute_cs16(const armral_fft_plan_t *p, const armral_cmplx_int16_t *x, armral_cmplx_int16_t *y) { - return armral::fft::execute(p, x, y, 1, 1, 1); + return armral::fft::execute<armral_cmplx_int16_t, armral_cmplx_int16_t, + armral_cmplx_f32_t>(p, x, y); } armral_status armral_fft_destroy_plan_cs16(armral_fft_plan_t **p) {
diff --git a/src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_gu.c b/src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_gu.c new file mode 100644 index 0000000000000000000000000000000000000000..27b76ddaf9b2f59b9085c6c7d4730b80b73d3b3d --- /dev/null +++ b/src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_gu.c @@ -0,0 +1,16962 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause +*/ +#include "fft_cs16_cf32_cf32_ac_n_gu.h" + +#include <arm_neon.h> +#ifdef ARMRAL_ARCH_SVE +#include <arm_sve.h> +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cf32_ac_n_gu7(const armral_cmplx_int16_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + int idist, float dir) { + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + float32x2_t *v6 = (float32x2_t *)y; 
+ for (int j = 0; j < howmany; j += 1) { + int16x4_t v20 = vld1s_s16(&v5[istride]); + float v80 = -1.1666666666666665e+00F; + float v84 = 7.9015646852540022e-01F; + float v88 = 5.5854267289647742e-02F; + float v92 = 7.3430220123575241e-01F; + float v95 = 4.4095855184409838e-01F; + float v96 = -4.4095855184409838e-01F; + float v102 = 3.4087293062393137e-01F; + float v103 = -3.4087293062393137e-01F; + float v109 = -5.3396936033772524e-01F; + float v110 = 5.3396936033772524e-01F; + float v116 = 8.7484229096165667e-01F; + float v117 = -8.7484229096165667e-01F; + float32x2_t v119 = (float32x2_t){v4, v4}; + float32x2_t v21 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v20)), 15); + int16x4_t v64 = vld1s_s16(&v5[0]); + float32x2_t v81 = (float32x2_t){v80, v80}; + float32x2_t v85 = (float32x2_t){v84, v84}; + float32x2_t v89 = (float32x2_t){v88, v88}; + float32x2_t v93 = (float32x2_t){v92, v92}; + float32x2_t v97 = (float32x2_t){v95, v96}; + float32x2_t v104 = (float32x2_t){v102, v103}; + float32x2_t v111 = (float32x2_t){v109, v110}; + float32x2_t v118 = (float32x2_t){v116, v117}; + int16x4_t v26 = vld1s_s16(&v5[istride * 6]); + int16x4_t v34 = vld1s_s16(&v5[istride * 4]); + int16x4_t v40 = vld1s_s16(&v5[istride * 3]); + int16x4_t v48 = vld1s_s16(&v5[istride * 2]); + int16x4_t v54 = vld1s_s16(&v5[istride * 5]); + float32x2_t v65 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v64)), 15); + float32x2_t v99 = vmul_f32(v119, v97); + float32x2_t v106 = vmul_f32(v119, v104); + float32x2_t v113 = vmul_f32(v119, v111); + float32x2_t v120 = vmul_f32(v119, v118); + float32x2_t v27 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v26)), 15); + float32x2_t v35 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v34)), 15); + float32x2_t v41 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v40)), 15); + float32x2_t v49 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v48)), 15); + float32x2_t v55 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v54)), 15); + float32x2_t v28 = vadd_f32(v21, v27); + float32x2_t v29 = vsub_f32(v21, v27); + float32x2_t v42 = vadd_f32(v35, v41); + float32x2_t v43 = vsub_f32(v35, v41); + float32x2_t v56 = vadd_f32(v49, v55); + float32x2_t v57 = vsub_f32(v49, v55); + float32x2_t v58 = vadd_f32(v28, v42); + float32x2_t v67 = vsub_f32(v28, v42); + float32x2_t v68 = vsub_f32(v42, v56); + float32x2_t v69 = vsub_f32(v56, v28); + float32x2_t v70 = vadd_f32(v29, v43); + float32x2_t v72 = vsub_f32(v29, v43); + float32x2_t v73 = vsub_f32(v43, v57); + float32x2_t v74 = vsub_f32(v57, v29); + float32x2_t v59 = vadd_f32(v58, v56); + float32x2_t v71 = vadd_f32(v70, v57); + float32x2_t v86 = vmul_f32(v67, v85); + float32x2_t v90 = vmul_f32(v68, v89); + float32x2_t v94 = vmul_f32(v69, v93); + float32x2_t v107 = vrev64_f32(v72); + float32x2_t v114 = vrev64_f32(v73); + float32x2_t v121 = vrev64_f32(v74); + float32x2_t v66 = vadd_f32(v59, v65); + float32x2_t v82 = vmul_f32(v59, v81); + float32x2_t v100 = vrev64_f32(v71); + float32x2_t v108 = vmul_f32(v107, v106); + float32x2_t v115 = vmul_f32(v114, v113); + float32x2_t v122 = vmul_f32(v121, v120); + float32x2_t v101 = vmul_f32(v100, v99); + float32x2_t v123 = vadd_f32(v66, v82); + v6[0] = v66; + float32x2_t v124 = vadd_f32(v123, v86); + float32x2_t v126 = vsub_f32(v123, v86); + float32x2_t v128 = vsub_f32(v123, v90); + float32x2_t v130 = vadd_f32(v101, v108); + float32x2_t v132 = vsub_f32(v101, v108); + float32x2_t v134 = vsub_f32(v101, v115); + float32x2_t v125 = vadd_f32(v124, v90); + float32x2_t v127 = vsub_f32(v126, v94); + float32x2_t v129 = vadd_f32(v128, v94); + float32x2_t v131 = vadd_f32(v130, v115); + 
float32x2_t v133 = vsub_f32(v132, v122); + float32x2_t v135 = vadd_f32(v134, v122); + float32x2_t v136 = vadd_f32(v125, v131); + float32x2_t v137 = vsub_f32(v125, v131); + float32x2_t v138 = vadd_f32(v127, v133); + float32x2_t v139 = vsub_f32(v127, v133); + float32x2_t v140 = vadd_f32(v129, v135); + float32x2_t v141 = vsub_f32(v129, v135); + v6[ostride] = v137; + v6[ostride * 2] = v139; + v6[ostride * 3] = v140; + v6[ostride * 4] = v141; + v6[ostride * 5] = v138; + v6[ostride * 6] = v136; + v5 += 1 * idist; + v6 += 1 * 1; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cf32_ac_n_gu7(const armral_cmplx_int16_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + int idist, float dir) { + int64_t v0 = istride; + int64_t v1 = idist; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * v1; + int64_t v12 = v10 * 1; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + float v97 = -1.1666666666666665e+00F; + float v102 = 7.9015646852540022e-01F; + float v107 = 5.5854267289647742e-02F; + float v112 = 7.3430220123575241e-01F; + float v117 = -4.4095855184409838e-01F; + float v124 = -3.4087293062393137e-01F; + float v131 = 5.3396936033772524e-01F; + float v138 = -8.7484229096165667e-01F; + const int32_t *v218 = &v5[v0]; + float32x2_t *v301 = &v6[v2]; + int64_t v27 = v0 * 6; + int64_t v37 = v0 * 4; + int64_t v45 = v0 * 3; + int64_t v55 = v0 * 2; + int64_t v63 = v0 * 5; + float v120 = v4 * v117; + float v127 = v4 * v124; + float v134 = v4 * v131; + float v141 = v4 * v138; + int64_t v178 = v2 * 2; + int64_t v185 = v2 * 3; + int64_t v192 = v2 * 4; + int64_t v199 = v2 * 5; + int64_t v206 = v2 * 6; + const int32_t *v273 = &v5[0]; + svint64_t v274 = svindex_s64(0, v1); + svfloat32_t v277 = svdup_n_f32(v97); + svfloat32_t v278 = svdup_n_f32(v102); + svfloat32_t v279 = svdup_n_f32(v107); + svfloat32_t v280 = svdup_n_f32(v112); + float32x2_t *v292 = &v6[0]; + svint16_t v220 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v218), v274)); + const int32_t *v227 = &v5[v27]; + const int32_t *v236 = &v5[v37]; + const int32_t *v245 = &v5[v45]; + const int32_t *v254 = &v5[v55]; + const int32_t *v263 = &v5[v63]; + svint16_t v275 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v273), v274)); + svfloat32_t v281 = svdup_n_f32(v120); + svfloat32_t v282 = svdup_n_f32(v127); + svfloat32_t v283 = svdup_n_f32(v134); + svfloat32_t v284 = svdup_n_f32(v141); + float32x2_t *v310 = &v6[v178]; + float32x2_t *v319 = &v6[v185]; + float32x2_t *v328 = &v6[v192]; + float32x2_t *v337 = &v6[v199]; + float32x2_t *v346 = &v6[v206]; + svfloat32_t v25 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v220, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v81 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v275, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svint16_t v229 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v227), v274)); + svint16_t v238 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v236), v274)); + svint16_t 
v247 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v245), v274)); + svint16_t v256 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v254), v274)); + svint16_t v265 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v263), v274)); + svfloat32_t v33 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v229, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v43 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v238, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v51 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v247, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v61 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v256, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v69 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v265, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v34 = svadd_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v35 = svsub_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v52 = svadd_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v53 = svsub_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v70 = svadd_f32_x(svptrue_b32(), v61, v69); + svfloat32_t v71 = svsub_f32_x(svptrue_b32(), v61, v69); + svfloat32_t v72 = svadd_f32_x(svptrue_b32(), v34, v52); + svfloat32_t v83 = svsub_f32_x(svptrue_b32(), v34, v52); + svfloat32_t v84 = svsub_f32_x(svptrue_b32(), v52, v70); + svfloat32_t v85 = svsub_f32_x(svptrue_b32(), v70, v34); + svfloat32_t v86 = svadd_f32_x(svptrue_b32(), v35, v53); + svfloat32_t v88 = svsub_f32_x(svptrue_b32(), v35, v53); + svfloat32_t v89 = svsub_f32_x(svptrue_b32(), v53, v71); + svfloat32_t v90 = svsub_f32_x(svptrue_b32(), v71, v35); + svfloat32_t v73 = svadd_f32_x(svptrue_b32(), v72, v70); + svfloat32_t v87 = svadd_f32_x(svptrue_b32(), v86, v71); + svfloat32_t zero129 = svdup_n_f32(0); + svfloat32_t v129 = svcmla_f32_x(pred_full, zero129, v282, v88, 90); + svfloat32_t zero136 = svdup_n_f32(0); + svfloat32_t v136 = svcmla_f32_x(pred_full, zero136, v283, v89, 90); + svfloat32_t zero143 = svdup_n_f32(0); + svfloat32_t v143 = svcmla_f32_x(pred_full, zero143, v284, v90, 90); + svfloat32_t v82 = svadd_f32_x(svptrue_b32(), v73, v81); + svfloat32_t zero122 = svdup_n_f32(0); + svfloat32_t v122 = svcmla_f32_x(pred_full, zero122, v281, v87, 90); + svfloat32_t v144 = svmla_f32_x(pred_full, v82, v73, v277); + svfloat32_t v151 = svadd_f32_x(svptrue_b32(), v122, v129); + svfloat32_t v153 = svsub_f32_x(svptrue_b32(), v122, v129); + svfloat32_t v155 = svsub_f32_x(svptrue_b32(), v122, v136); + svst1_f64(pred_full, (double *)(v292), svreinterpret_f64_f32(v82)); + svfloat32_t v145 = svmla_f32_x(pred_full, v144, v83, v278); + svfloat32_t v147 = svmls_f32_x(pred_full, v144, v83, v278); + svfloat32_t v149 = svmls_f32_x(pred_full, v144, v84, v279); + svfloat32_t v152 = svadd_f32_x(svptrue_b32(), v151, v136); + svfloat32_t v154 = svsub_f32_x(svptrue_b32(), v153, 
v143); + svfloat32_t v156 = svadd_f32_x(svptrue_b32(), v155, v143); + svfloat32_t v146 = svmla_f32_x(pred_full, v145, v84, v279); + svfloat32_t v148 = svmls_f32_x(pred_full, v147, v85, v280); + svfloat32_t v150 = svmla_f32_x(pred_full, v149, v85, v280); + svfloat32_t v157 = svadd_f32_x(svptrue_b32(), v146, v152); + svfloat32_t v158 = svsub_f32_x(svptrue_b32(), v146, v152); + svfloat32_t v159 = svadd_f32_x(svptrue_b32(), v148, v154); + svfloat32_t v160 = svsub_f32_x(svptrue_b32(), v148, v154); + svfloat32_t v161 = svadd_f32_x(svptrue_b32(), v150, v156); + svfloat32_t v162 = svsub_f32_x(svptrue_b32(), v150, v156); + svst1_f64(pred_full, (double *)(v301), svreinterpret_f64_f32(v158)); + svst1_f64(pred_full, (double *)(v310), svreinterpret_f64_f32(v160)); + svst1_f64(pred_full, (double *)(v319), svreinterpret_f64_f32(v161)); + svst1_f64(pred_full, (double *)(v328), svreinterpret_f64_f32(v162)); + svst1_f64(pred_full, (double *)(v337), svreinterpret_f64_f32(v159)); + svst1_f64(pred_full, (double *)(v346), svreinterpret_f64_f32(v157)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cf32_ac_n_gu9(const armral_cmplx_int16_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + int idist, float dir) { + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + for (int j = 0; j < howmany; j += 1) { + int16x4_t v20 = vld1s_s16(&v5[istride]); + float v95 = -5.0000000000000000e-01F; + float v106 = -1.4999999999999998e+00F; + float v109 = 8.6602540378443871e-01F; + float v110 = -8.6602540378443871e-01F; + float v117 = 7.6604444311897801e-01F; + float v121 = 9.3969262078590832e-01F; + float v125 = -1.7364817766693039e-01F; + float v128 = 6.4278760968653925e-01F; + float v129 = -6.4278760968653925e-01F; + float v135 = -3.4202014332566888e-01F; + float v136 = 3.4202014332566888e-01F; + float v142 = 9.8480775301220802e-01F; + float v143 = -9.8480775301220802e-01F; + float32x2_t v145 = (float32x2_t){v4, v4}; + float32x2_t v21 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v20)), 15); + int16x4_t v79 = vld1s_s16(&v5[0]); + float32x2_t v96 = (float32x2_t){v95, v95}; + float32x2_t v107 = (float32x2_t){v106, v106}; + float32x2_t v111 = (float32x2_t){v109, v110}; + float32x2_t v118 = (float32x2_t){v117, v117}; + float32x2_t v122 = (float32x2_t){v121, v121}; + float32x2_t v126 = (float32x2_t){v125, v125}; + float32x2_t v130 = (float32x2_t){v128, v129}; + float32x2_t v137 = (float32x2_t){v135, v136}; + float32x2_t v144 = (float32x2_t){v142, v143}; + int16x4_t v26 = vld1s_s16(&v5[istride * 8]); + int16x4_t v34 = vld1s_s16(&v5[istride * 7]); + int16x4_t v40 = vld1s_s16(&v5[istride * 2]); + int16x4_t v48 = vld1s_s16(&v5[istride * 3]); + int16x4_t v54 = vld1s_s16(&v5[istride * 6]); + int16x4_t v62 = vld1s_s16(&v5[istride * 4]); + int16x4_t v68 = vld1s_s16(&v5[istride * 5]); + float32x2_t v80 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v79)), 15); + float32x2_t v113 = vmul_f32(v145, v111); + float32x2_t v132 = vmul_f32(v145, v130); + float32x2_t v139 = vmul_f32(v145, v137); + float32x2_t v146 = vmul_f32(v145, v144); + float32x2_t v27 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v26)), 15); + float32x2_t v35 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v34)), 15); + float32x2_t v41 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v40)), 15); + float32x2_t v49 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v48)), 15); + float32x2_t v55 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v54)), 15); + float32x2_t v63 = 
vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v62)), 15); + float32x2_t v69 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v68)), 15); + float32x2_t v28 = vadd_f32(v21, v27); + float32x2_t v29 = vsub_f32(v21, v27); + float32x2_t v42 = vadd_f32(v35, v41); + float32x2_t v43 = vsub_f32(v35, v41); + float32x2_t v56 = vadd_f32(v49, v55); + float32x2_t v57 = vsub_f32(v49, v55); + float32x2_t v70 = vadd_f32(v63, v69); + float32x2_t v71 = vsub_f32(v63, v69); + float32x2_t v72 = vadd_f32(v28, v42); + float32x2_t v82 = vadd_f32(v29, v43); + float32x2_t v84 = vsub_f32(v28, v42); + float32x2_t v85 = vsub_f32(v42, v70); + float32x2_t v86 = vsub_f32(v70, v28); + float32x2_t v87 = vsub_f32(v29, v43); + float32x2_t v88 = vsub_f32(v43, v71); + float32x2_t v89 = vsub_f32(v71, v29); + float32x2_t v108 = vmul_f32(v56, v107); + float32x2_t v114 = vrev64_f32(v57); + float32x2_t v73 = vadd_f32(v72, v70); + float32x2_t v83 = vadd_f32(v82, v71); + float32x2_t v115 = vmul_f32(v114, v113); + float32x2_t v119 = vmul_f32(v84, v118); + float32x2_t v123 = vmul_f32(v85, v122); + float32x2_t v127 = vmul_f32(v86, v126); + float32x2_t v133 = vrev64_f32(v87); + float32x2_t v140 = vrev64_f32(v88); + float32x2_t v147 = vrev64_f32(v89); + float32x2_t v74 = vadd_f32(v73, v56); + float32x2_t v97 = vmul_f32(v73, v96); + float32x2_t v103 = vrev64_f32(v83); + float32x2_t v134 = vmul_f32(v133, v132); + float32x2_t v141 = vmul_f32(v140, v139); + float32x2_t v148 = vmul_f32(v147, v146); + float32x2_t v81 = vadd_f32(v74, v80); + float32x2_t v104 = vmul_f32(v103, v113); + float32x2_t v149 = vadd_f32(v97, v97); + float32x2_t v162 = vadd_f32(v115, v134); + float32x2_t v164 = vsub_f32(v115, v141); + float32x2_t v166 = vsub_f32(v115, v134); + float32x2_t v150 = vadd_f32(v149, v97); + float32x2_t v154 = vadd_f32(v81, v108); + float32x2_t v163 = vadd_f32(v162, v141); + float32x2_t v165 = vadd_f32(v164, v148); + float32x2_t v167 = vsub_f32(v166, v148); + v6[0] = v81; + float32x2_t v151 = vadd_f32(v81, v150); + float32x2_t v155 = vadd_f32(v154, v149); + float32x2_t v152 = vadd_f32(v151, v104); + float32x2_t v153 = vsub_f32(v151, v104); + float32x2_t v156 = vadd_f32(v155, v119); + float32x2_t v158 = vsub_f32(v155, v123); + float32x2_t v160 = vsub_f32(v155, v119); + float32x2_t v157 = vadd_f32(v156, v123); + float32x2_t v159 = vadd_f32(v158, v127); + float32x2_t v161 = vsub_f32(v160, v127); + v6[ostride * 3] = v153; + v6[ostride * 6] = v152; + float32x2_t v168 = vadd_f32(v157, v163); + float32x2_t v169 = vsub_f32(v157, v163); + float32x2_t v170 = vadd_f32(v159, v165); + float32x2_t v171 = vsub_f32(v159, v165); + float32x2_t v172 = vadd_f32(v161, v167); + float32x2_t v173 = vsub_f32(v161, v167); + v6[ostride] = v169; + v6[ostride * 2] = v170; + v6[ostride * 4] = v173; + v6[ostride * 5] = v172; + v6[ostride * 7] = v171; + v6[ostride * 8] = v168; + v5 += 1 * idist; + v6 += 1 * 1; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cf32_ac_n_gu9(const armral_cmplx_int16_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + int idist, float dir) { + int64_t v0 = istride; + int64_t v1 = idist; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * v1; + int64_t v12 = v10 * 1; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + float v116 = -5.0000000000000000e-01F; + float v128 = -1.4999999999999998e+00F; + float v133 = 
-8.6602540378443871e-01F; + float v140 = 7.6604444311897801e-01F; + float v145 = 9.3969262078590832e-01F; + float v150 = -1.7364817766693039e-01F; + float v155 = -6.4278760968653925e-01F; + float v162 = 3.4202014332566888e-01F; + float v169 = -9.8480775301220802e-01F; + const int32_t *v269 = &v5[v0]; + float32x2_t *v372 = &v6[v2]; + int64_t v27 = v0 * 8; + int64_t v37 = v0 * 7; + int64_t v45 = v0 * 2; + int64_t v55 = v0 * 3; + int64_t v63 = v0 * 6; + int64_t v73 = v0 * 4; + int64_t v81 = v0 * 5; + float v136 = v4 * v133; + float v158 = v4 * v155; + float v165 = v4 * v162; + float v172 = v4 * v169; + int64_t v215 = v2 * 2; + int64_t v222 = v2 * 3; + int64_t v229 = v2 * 4; + int64_t v236 = v2 * 5; + int64_t v243 = v2 * 6; + int64_t v250 = v2 * 7; + int64_t v257 = v2 * 8; + const int32_t *v342 = &v5[0]; + svint64_t v343 = svindex_s64(0, v1); + svfloat32_t v346 = svdup_n_f32(v116); + svfloat32_t v348 = svdup_n_f32(v128); + svfloat32_t v350 = svdup_n_f32(v140); + svfloat32_t v351 = svdup_n_f32(v145); + svfloat32_t v352 = svdup_n_f32(v150); + float32x2_t *v363 = &v6[0]; + svint16_t v271 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v269), v343)); + const int32_t *v278 = &v5[v27]; + const int32_t *v287 = &v5[v37]; + const int32_t *v296 = &v5[v45]; + const int32_t *v305 = &v5[v55]; + const int32_t *v314 = &v5[v63]; + const int32_t *v323 = &v5[v73]; + const int32_t *v332 = &v5[v81]; + svint16_t v344 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v342), v343)); + svfloat32_t v349 = svdup_n_f32(v136); + svfloat32_t v353 = svdup_n_f32(v158); + svfloat32_t v354 = svdup_n_f32(v165); + svfloat32_t v355 = svdup_n_f32(v172); + float32x2_t *v381 = &v6[v215]; + float32x2_t *v390 = &v6[v222]; + float32x2_t *v399 = &v6[v229]; + float32x2_t *v408 = &v6[v236]; + float32x2_t *v417 = &v6[v243]; + float32x2_t *v426 = &v6[v250]; + float32x2_t *v435 = &v6[v257]; + svfloat32_t v25 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v271, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v100 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v344, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svint16_t v280 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v278), v343)); + svint16_t v289 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v287), v343)); + svint16_t v298 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v296), v343)); + svint16_t v307 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v305), v343)); + svint16_t v316 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v314), v343)); + svint16_t v325 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v323), v343)); + svint16_t v334 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v332), v343)); + svfloat32_t v33 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v280, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v43 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, 
svreinterpret_s32_s16(svtbl_s16( + v289, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v51 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v298, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v61 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v307, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v69 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v316, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v79 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v325, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v87 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v334, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v34 = svadd_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v35 = svsub_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v52 = svadd_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v53 = svsub_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v70 = svadd_f32_x(svptrue_b32(), v61, v69); + svfloat32_t v71 = svsub_f32_x(svptrue_b32(), v61, v69); + svfloat32_t v88 = svadd_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v89 = svsub_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v90 = svadd_f32_x(svptrue_b32(), v34, v52); + svfloat32_t v102 = svadd_f32_x(svptrue_b32(), v35, v53); + svfloat32_t v104 = svsub_f32_x(svptrue_b32(), v34, v52); + svfloat32_t v105 = svsub_f32_x(svptrue_b32(), v52, v88); + svfloat32_t v106 = svsub_f32_x(svptrue_b32(), v88, v34); + svfloat32_t v107 = svsub_f32_x(svptrue_b32(), v35, v53); + svfloat32_t v108 = svsub_f32_x(svptrue_b32(), v53, v89); + svfloat32_t v109 = svsub_f32_x(svptrue_b32(), v89, v35); + svfloat32_t zero138 = svdup_n_f32(0); + svfloat32_t v138 = svcmla_f32_x(pred_full, zero138, v349, v71, 90); + svfloat32_t v91 = svadd_f32_x(svptrue_b32(), v90, v88); + svfloat32_t v103 = svadd_f32_x(svptrue_b32(), v102, v89); + svfloat32_t zero160 = svdup_n_f32(0); + svfloat32_t v160 = svcmla_f32_x(pred_full, zero160, v353, v107, 90); + svfloat32_t zero167 = svdup_n_f32(0); + svfloat32_t v167 = svcmla_f32_x(pred_full, zero167, v354, v108, 90); + svfloat32_t zero174 = svdup_n_f32(0); + svfloat32_t v174 = svcmla_f32_x(pred_full, zero174, v355, v109, 90); + svfloat32_t v92 = svadd_f32_x(svptrue_b32(), v91, v70); + svfloat32_t v119 = svmul_f32_x(svptrue_b32(), v91, v346); + svfloat32_t zero126 = svdup_n_f32(0); + svfloat32_t v126 = svcmla_f32_x(pred_full, zero126, v349, v103, 90); + svfloat32_t v188 = svadd_f32_x(svptrue_b32(), v138, v160); + svfloat32_t v190 = svsub_f32_x(svptrue_b32(), v138, v167); + svfloat32_t v192 = svsub_f32_x(svptrue_b32(), v138, v160); + svfloat32_t v101 = svadd_f32_x(svptrue_b32(), v92, v100); + svfloat32_t v175 = svadd_f32_x(svptrue_b32(), v119, v119); + svfloat32_t v189 = svadd_f32_x(svptrue_b32(), v188, v167); + svfloat32_t v191 = svadd_f32_x(svptrue_b32(), v190, v174); + svfloat32_t v193 = svsub_f32_x(svptrue_b32(), v192, v174); + svfloat32_t 
v176 = svmla_f32_x(pred_full, v175, v91, v346); + svfloat32_t v180 = svmla_f32_x(pred_full, v101, v70, v348); + svst1_f64(pred_full, (double *)(v363), svreinterpret_f64_f32(v101)); + svfloat32_t v177 = svadd_f32_x(svptrue_b32(), v101, v176); + svfloat32_t v181 = svadd_f32_x(svptrue_b32(), v180, v175); + svfloat32_t v178 = svadd_f32_x(svptrue_b32(), v177, v126); + svfloat32_t v179 = svsub_f32_x(svptrue_b32(), v177, v126); + svfloat32_t v182 = svmla_f32_x(pred_full, v181, v104, v350); + svfloat32_t v184 = svmls_f32_x(pred_full, v181, v105, v351); + svfloat32_t v186 = svmls_f32_x(pred_full, v181, v104, v350); + svfloat32_t v183 = svmla_f32_x(pred_full, v182, v105, v351); + svfloat32_t v185 = svmla_f32_x(pred_full, v184, v106, v352); + svfloat32_t v187 = svmls_f32_x(pred_full, v186, v106, v352); + svst1_f64(pred_full, (double *)(v390), svreinterpret_f64_f32(v179)); + svst1_f64(pred_full, (double *)(v417), svreinterpret_f64_f32(v178)); + svfloat32_t v194 = svadd_f32_x(svptrue_b32(), v183, v189); + svfloat32_t v195 = svsub_f32_x(svptrue_b32(), v183, v189); + svfloat32_t v196 = svadd_f32_x(svptrue_b32(), v185, v191); + svfloat32_t v197 = svsub_f32_x(svptrue_b32(), v185, v191); + svfloat32_t v198 = svadd_f32_x(svptrue_b32(), v187, v193); + svfloat32_t v199 = svsub_f32_x(svptrue_b32(), v187, v193); + svst1_f64(pred_full, (double *)(v372), svreinterpret_f64_f32(v195)); + svst1_f64(pred_full, (double *)(v381), svreinterpret_f64_f32(v196)); + svst1_f64(pred_full, (double *)(v399), svreinterpret_f64_f32(v199)); + svst1_f64(pred_full, (double *)(v408), svreinterpret_f64_f32(v198)); + svst1_f64(pred_full, (double *)(v426), svreinterpret_f64_f32(v197)); + svst1_f64(pred_full, (double *)(v435), svreinterpret_f64_f32(v194)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cf32_ac_n_gu11(const armral_cmplx_int16_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + int idist, float dir) { + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + for (int j = 0; j < howmany; j += 1) { + int16x4_t v20 = vld1s_s16(&v5[istride]); + float v124 = 1.1000000000000001e+00F; + float v127 = 3.3166247903554003e-01F; + float v128 = -3.3166247903554003e-01F; + float v135 = 5.1541501300188641e-01F; + float v139 = 9.4125353283118118e-01F; + float v143 = 1.4143537075597825e+00F; + float v147 = 8.5949297361449750e-01F; + float v151 = 4.2314838273285138e-02F; + float v155 = 3.8639279888589606e-01F; + float v159 = 5.1254589567200015e-01F; + float v163 = 1.0702757469471715e+00F; + float v167 = 5.5486073394528512e-01F; + float v170 = 1.2412944743900585e+00F; + float v171 = -1.2412944743900585e+00F; + float v177 = 2.0897833842005756e-01F; + float v178 = -2.0897833842005756e-01F; + float v184 = 3.7415717312460811e-01F; + float v185 = -3.7415717312460811e-01F; + float v191 = 4.9929922194110327e-02F; + float v192 = -4.9929922194110327e-02F; + float v198 = 6.5815896284539266e-01F; + float v199 = -6.5815896284539266e-01F; + float v205 = 6.3306543373877577e-01F; + float v206 = -6.3306543373877577e-01F; + float v212 = 1.0822460581641109e+00F; + float v213 = -1.0822460581641109e+00F; + float v219 = 8.1720737907134022e-01F; + float v220 = -8.1720737907134022e-01F; + float v226 = 4.2408709531871824e-01F; + float v227 = -4.2408709531871824e-01F; + float32x2_t v229 = (float32x2_t){v4, v4}; + float32x2_t v21 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v20)), 15); + int16x4_t v96 = vld1s_s16(&v5[0]); + float32x2_t v125 = 
(float32x2_t){v124, v124}; + float32x2_t v129 = (float32x2_t){v127, v128}; + float32x2_t v136 = (float32x2_t){v135, v135}; + float32x2_t v140 = (float32x2_t){v139, v139}; + float32x2_t v144 = (float32x2_t){v143, v143}; + float32x2_t v148 = (float32x2_t){v147, v147}; + float32x2_t v152 = (float32x2_t){v151, v151}; + float32x2_t v156 = (float32x2_t){v155, v155}; + float32x2_t v160 = (float32x2_t){v159, v159}; + float32x2_t v164 = (float32x2_t){v163, v163}; + float32x2_t v168 = (float32x2_t){v167, v167}; + float32x2_t v172 = (float32x2_t){v170, v171}; + float32x2_t v179 = (float32x2_t){v177, v178}; + float32x2_t v186 = (float32x2_t){v184, v185}; + float32x2_t v193 = (float32x2_t){v191, v192}; + float32x2_t v200 = (float32x2_t){v198, v199}; + float32x2_t v207 = (float32x2_t){v205, v206}; + float32x2_t v214 = (float32x2_t){v212, v213}; + float32x2_t v221 = (float32x2_t){v219, v220}; + float32x2_t v228 = (float32x2_t){v226, v227}; + int16x4_t v26 = vld1s_s16(&v5[istride * 10]); + int16x4_t v33 = vld1s_s16(&v5[istride * 2]); + int16x4_t v39 = vld1s_s16(&v5[istride * 9]); + int16x4_t v46 = vld1s_s16(&v5[istride * 3]); + int16x4_t v52 = vld1s_s16(&v5[istride * 8]); + int16x4_t v59 = vld1s_s16(&v5[istride * 4]); + int16x4_t v65 = vld1s_s16(&v5[istride * 7]); + int16x4_t v72 = vld1s_s16(&v5[istride * 5]); + int16x4_t v78 = vld1s_s16(&v5[istride * 6]); + float32x2_t v97 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v96)), 15); + float32x2_t v131 = vmul_f32(v229, v129); + float32x2_t v174 = vmul_f32(v229, v172); + float32x2_t v181 = vmul_f32(v229, v179); + float32x2_t v188 = vmul_f32(v229, v186); + float32x2_t v195 = vmul_f32(v229, v193); + float32x2_t v202 = vmul_f32(v229, v200); + float32x2_t v209 = vmul_f32(v229, v207); + float32x2_t v216 = vmul_f32(v229, v214); + float32x2_t v223 = vmul_f32(v229, v221); + float32x2_t v230 = vmul_f32(v229, v228); + float32x2_t v27 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v26)), 15); + float32x2_t v34 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v33)), 15); + float32x2_t v40 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v39)), 15); + float32x2_t v47 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v46)), 15); + float32x2_t v53 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v52)), 15); + float32x2_t v60 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v59)), 15); + float32x2_t v66 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v65)), 15); + float32x2_t v73 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v72)), 15); + float32x2_t v79 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v78)), 15); + float32x2_t v28 = vadd_f32(v21, v27); + float32x2_t v41 = vadd_f32(v34, v40); + float32x2_t v54 = vadd_f32(v47, v53); + float32x2_t v67 = vadd_f32(v60, v66); + float32x2_t v80 = vadd_f32(v73, v79); + float32x2_t v81 = vsub_f32(v21, v27); + float32x2_t v82 = vsub_f32(v34, v40); + float32x2_t v83 = vsub_f32(v47, v53); + float32x2_t v84 = vsub_f32(v60, v66); + float32x2_t v85 = vsub_f32(v73, v79); + float32x2_t v86 = vadd_f32(v28, v41); + float32x2_t v87 = vadd_f32(v54, v80); + float32x2_t v89 = vsub_f32(v82, v83); + float32x2_t v90 = vadd_f32(v81, v85); + float32x2_t v101 = vsub_f32(v41, v67); + float32x2_t v102 = vsub_f32(v28, v67); + float32x2_t v103 = vsub_f32(v41, v28); + float32x2_t v104 = vsub_f32(v80, v67); + float32x2_t v105 = vsub_f32(v54, v67); + float32x2_t v106 = vsub_f32(v80, v54); + float32x2_t v107 = vsub_f32(v41, v80); + float32x2_t v108 = vsub_f32(v28, v54); + float32x2_t v110 = vadd_f32(v82, v84); + float32x2_t v111 = vsub_f32(v81, v84); + float32x2_t v112 = vadd_f32(v81, v82); + float32x2_t v113 = vsub_f32(v84, v85); + 
float32x2_t v114 = vsub_f32(v83, v84); + float32x2_t v115 = vsub_f32(v83, v85); + float32x2_t v116 = vadd_f32(v82, v85); + float32x2_t v117 = vsub_f32(v81, v83); + float32x2_t v88 = vadd_f32(v67, v86); + float32x2_t v99 = vsub_f32(v89, v90); + float32x2_t v109 = vsub_f32(v87, v86); + float32x2_t v118 = vadd_f32(v89, v90); + float32x2_t v137 = vmul_f32(v101, v136); + float32x2_t v141 = vmul_f32(v102, v140); + float32x2_t v145 = vmul_f32(v103, v144); + float32x2_t v149 = vmul_f32(v104, v148); + float32x2_t v153 = vmul_f32(v105, v152); + float32x2_t v157 = vmul_f32(v106, v156); + float32x2_t v161 = vmul_f32(v107, v160); + float32x2_t v165 = vmul_f32(v108, v164); + float32x2_t v175 = vrev64_f32(v110); + float32x2_t v182 = vrev64_f32(v111); + float32x2_t v189 = vrev64_f32(v112); + float32x2_t v196 = vrev64_f32(v113); + float32x2_t v203 = vrev64_f32(v114); + float32x2_t v210 = vrev64_f32(v115); + float32x2_t v217 = vrev64_f32(v116); + float32x2_t v224 = vrev64_f32(v117); + float32x2_t v91 = vadd_f32(v88, v87); + float32x2_t v100 = vsub_f32(v99, v84); + float32x2_t v169 = vmul_f32(v109, v168); + float32x2_t v176 = vmul_f32(v175, v174); + float32x2_t v183 = vmul_f32(v182, v181); + float32x2_t v190 = vmul_f32(v189, v188); + float32x2_t v197 = vmul_f32(v196, v195); + float32x2_t v204 = vmul_f32(v203, v202); + float32x2_t v211 = vmul_f32(v210, v209); + float32x2_t v218 = vmul_f32(v217, v216); + float32x2_t v225 = vmul_f32(v224, v223); + float32x2_t v231 = vrev64_f32(v118); + float32x2_t v234 = vadd_f32(v137, v141); + float32x2_t v235 = vadd_f32(v141, v145); + float32x2_t v236 = vsub_f32(v137, v145); + float32x2_t v237 = vadd_f32(v149, v153); + float32x2_t v238 = vadd_f32(v153, v157); + float32x2_t v239 = vsub_f32(v149, v157); + float32x2_t v98 = vadd_f32(v97, v91); + float32x2_t v126 = vmul_f32(v91, v125); + float32x2_t v132 = vrev64_f32(v100); + float32x2_t v232 = vmul_f32(v231, v230); + float32x2_t v240 = vadd_f32(v165, v169); + float32x2_t v241 = vadd_f32(v161, v169); + float32x2_t v242 = vadd_f32(v183, v190); + float32x2_t v243 = vsub_f32(v176, v190); + float32x2_t v244 = vadd_f32(v204, v211); + float32x2_t v245 = vsub_f32(v197, v211); + float32x2_t v133 = vmul_f32(v132, v131); + float32x2_t v233 = vsub_f32(v98, v126); + float32x2_t v246 = vadd_f32(v225, v232); + float32x2_t v247 = vsub_f32(v218, v232); + float32x2_t v248 = vadd_f32(v238, v240); + float32x2_t v266 = vadd_f32(v242, v243); + v6[0] = v98; + float32x2_t v249 = vadd_f32(v248, v233); + float32x2_t v250 = vsub_f32(v233, v235); + float32x2_t v252 = vadd_f32(v233, v239); + float32x2_t v254 = vsub_f32(v233, v236); + float32x2_t v256 = vadd_f32(v233, v234); + float32x2_t v258 = vadd_f32(v133, v244); + float32x2_t v260 = vsub_f32(v246, v242); + float32x2_t v262 = vadd_f32(v133, v247); + float32x2_t v264 = vsub_f32(v247, v243); + float32x2_t v267 = vadd_f32(v266, v244); + float32x2_t v251 = vsub_f32(v250, v240); + float32x2_t v253 = vadd_f32(v252, v241); + float32x2_t v255 = vsub_f32(v254, v241); + float32x2_t v257 = vsub_f32(v256, v237); + float32x2_t v259 = vadd_f32(v258, v246); + float32x2_t v261 = vsub_f32(v260, v133); + float32x2_t v263 = vadd_f32(v262, v245); + float32x2_t v265 = vsub_f32(v264, v133); + float32x2_t v268 = vadd_f32(v267, v245); + float32x2_t v269 = vsub_f32(v268, v133); + float32x2_t v271 = vadd_f32(v249, v259); + float32x2_t v272 = vadd_f32(v251, v261); + float32x2_t v273 = vsub_f32(v253, v263); + float32x2_t v274 = vadd_f32(v255, v265); + float32x2_t v275 = vsub_f32(v255, v265); + float32x2_t v276 = vadd_f32(v253, 
v263); + float32x2_t v277 = vsub_f32(v251, v261); + float32x2_t v278 = vsub_f32(v249, v259); + float32x2_t v270 = vadd_f32(v257, v269); + float32x2_t v279 = vsub_f32(v257, v269); + v6[ostride * 9] = v271; + v6[ostride * 8] = v272; + v6[ostride * 7] = v273; + v6[ostride * 6] = v274; + v6[ostride * 5] = v275; + v6[ostride * 4] = v276; + v6[ostride * 3] = v277; + v6[ostride * 2] = v278; + v6[ostride * 10] = v270; + v6[ostride] = v279; + v5 += 1 * idist; + v6 += 1 * 1; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cf32_ac_n_gu11(const armral_cmplx_int16_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + int idist, float dir) { + int64_t v0 = istride; + int64_t v1 = idist; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * v1; + int64_t v12 = v10 * 1; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + float v149 = 1.1000000000000001e+00F; + float v154 = -3.3166247903554003e-01F; + float v161 = 5.1541501300188641e-01F; + float v166 = 9.4125353283118118e-01F; + float v171 = 1.4143537075597825e+00F; + float v176 = 8.5949297361449750e-01F; + float v181 = 4.2314838273285138e-02F; + float v186 = 3.8639279888589606e-01F; + float v191 = 5.1254589567200015e-01F; + float v196 = 1.0702757469471715e+00F; + float v201 = 5.5486073394528512e-01F; + float v206 = -1.2412944743900585e+00F; + float v213 = -2.0897833842005756e-01F; + float v220 = -3.7415717312460811e-01F; + float v227 = -4.9929922194110327e-02F; + float v234 = -6.5815896284539266e-01F; + float v241 = -6.3306543373877577e-01F; + float v248 = -1.0822460581641109e+00F; + float v255 = -8.1720737907134022e-01F; + float v262 = -4.2408709531871824e-01F; + const int32_t *v398 = &v5[v0]; + float32x2_t *v610 = &v6[v2]; + int64_t v27 = v0 * 10; + int64_t v36 = v0 * 2; + int64_t v44 = v0 * 9; + int64_t v53 = v0 * 3; + int64_t v61 = v0 * 8; + int64_t v70 = v0 * 4; + int64_t v78 = v0 * 7; + int64_t v87 = v0 * 5; + int64_t v95 = v0 * 6; + float v157 = v4 * v154; + float v209 = v4 * v206; + float v216 = v4 * v213; + float v223 = v4 * v220; + float v230 = v4 * v227; + float v237 = v4 * v234; + float v244 = v4 * v241; + float v251 = v4 * v248; + float v258 = v4 * v255; + float v265 = v4 * v262; + int64_t v323 = v2 * 10; + int64_t v330 = v2 * 9; + int64_t v337 = v2 * 8; + int64_t v344 = v2 * 7; + int64_t v351 = v2 * 6; + int64_t v358 = v2 * 5; + int64_t v365 = v2 * 4; + int64_t v372 = v2 * 3; + int64_t v379 = v2 * 2; + const int32_t *v489 = &v5[0]; + svint64_t v490 = svindex_s64(0, v1); + svfloat32_t v493 = svdup_n_f32(v149); + svfloat32_t v495 = svdup_n_f32(v161); + svfloat32_t v496 = svdup_n_f32(v166); + svfloat32_t v497 = svdup_n_f32(v171); + svfloat32_t v498 = svdup_n_f32(v176); + svfloat32_t v499 = svdup_n_f32(v181); + svfloat32_t v500 = svdup_n_f32(v186); + svfloat32_t v501 = svdup_n_f32(v191); + svfloat32_t v502 = svdup_n_f32(v196); + svfloat32_t v503 = svdup_n_f32(v201); + float32x2_t *v520 = &v6[0]; + svint16_t v400 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v398), v490)); + const int32_t *v407 = &v5[v27]; + const int32_t *v416 = &v5[v36]; + const int32_t *v425 = &v5[v44]; + const int32_t *v434 = &v5[v53]; + const int32_t *v443 = &v5[v61]; + const int32_t *v452 = &v5[v70]; + const int32_t *v461 = &v5[v78]; + const int32_t *v470 = &v5[v87]; + const int32_t *v479 = 
&v5[v95]; + svint16_t v491 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v489), v490)); + svfloat32_t v494 = svdup_n_f32(v157); + svfloat32_t v504 = svdup_n_f32(v209); + svfloat32_t v505 = svdup_n_f32(v216); + svfloat32_t v506 = svdup_n_f32(v223); + svfloat32_t v507 = svdup_n_f32(v230); + svfloat32_t v508 = svdup_n_f32(v237); + svfloat32_t v509 = svdup_n_f32(v244); + svfloat32_t v510 = svdup_n_f32(v251); + svfloat32_t v511 = svdup_n_f32(v258); + svfloat32_t v512 = svdup_n_f32(v265); + float32x2_t *v529 = &v6[v323]; + float32x2_t *v538 = &v6[v330]; + float32x2_t *v547 = &v6[v337]; + float32x2_t *v556 = &v6[v344]; + float32x2_t *v565 = &v6[v351]; + float32x2_t *v574 = &v6[v358]; + float32x2_t *v583 = &v6[v365]; + float32x2_t *v592 = &v6[v372]; + float32x2_t *v601 = &v6[v379]; + svfloat32_t v25 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v400, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v121 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v491, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svint16_t v409 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v407), v490)); + svint16_t v418 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v416), v490)); + svint16_t v427 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v425), v490)); + svint16_t v436 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v434), v490)); + svint16_t v445 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v443), v490)); + svint16_t v454 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v452), v490)); + svint16_t v463 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v461), v490)); + svint16_t v472 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v470), v490)); + svint16_t v481 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v479), v490)); + svfloat32_t v33 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v409, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v42 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v418, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v50 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v427, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v59 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v436, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v67 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v445, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v76 = svmul_n_f32_x( + 
pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v454, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v84 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v463, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v93 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v472, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v101 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v481, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v34 = svadd_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v51 = svadd_f32_x(svptrue_b32(), v42, v50); + svfloat32_t v68 = svadd_f32_x(svptrue_b32(), v59, v67); + svfloat32_t v85 = svadd_f32_x(svptrue_b32(), v76, v84); + svfloat32_t v102 = svadd_f32_x(svptrue_b32(), v93, v101); + svfloat32_t v103 = svsub_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v104 = svsub_f32_x(svptrue_b32(), v42, v50); + svfloat32_t v105 = svsub_f32_x(svptrue_b32(), v59, v67); + svfloat32_t v106 = svsub_f32_x(svptrue_b32(), v76, v84); + svfloat32_t v107 = svsub_f32_x(svptrue_b32(), v93, v101); + svfloat32_t v108 = svadd_f32_x(svptrue_b32(), v34, v51); + svfloat32_t v109 = svadd_f32_x(svptrue_b32(), v68, v102); + svfloat32_t v111 = svsub_f32_x(svptrue_b32(), v104, v105); + svfloat32_t v112 = svadd_f32_x(svptrue_b32(), v103, v107); + svfloat32_t v125 = svsub_f32_x(svptrue_b32(), v51, v85); + svfloat32_t v126 = svsub_f32_x(svptrue_b32(), v34, v85); + svfloat32_t v127 = svsub_f32_x(svptrue_b32(), v51, v34); + svfloat32_t v128 = svsub_f32_x(svptrue_b32(), v102, v85); + svfloat32_t v129 = svsub_f32_x(svptrue_b32(), v68, v85); + svfloat32_t v130 = svsub_f32_x(svptrue_b32(), v102, v68); + svfloat32_t v131 = svsub_f32_x(svptrue_b32(), v51, v102); + svfloat32_t v132 = svsub_f32_x(svptrue_b32(), v34, v68); + svfloat32_t v134 = svadd_f32_x(svptrue_b32(), v104, v106); + svfloat32_t v135 = svsub_f32_x(svptrue_b32(), v103, v106); + svfloat32_t v136 = svadd_f32_x(svptrue_b32(), v103, v104); + svfloat32_t v137 = svsub_f32_x(svptrue_b32(), v106, v107); + svfloat32_t v138 = svsub_f32_x(svptrue_b32(), v105, v106); + svfloat32_t v139 = svsub_f32_x(svptrue_b32(), v105, v107); + svfloat32_t v140 = svadd_f32_x(svptrue_b32(), v104, v107); + svfloat32_t v141 = svsub_f32_x(svptrue_b32(), v103, v105); + svfloat32_t v110 = svadd_f32_x(svptrue_b32(), v85, v108); + svfloat32_t v123 = svsub_f32_x(svptrue_b32(), v111, v112); + svfloat32_t v133 = svsub_f32_x(svptrue_b32(), v109, v108); + svfloat32_t v142 = svadd_f32_x(svptrue_b32(), v111, v112); + svfloat32_t v169 = svmul_f32_x(svptrue_b32(), v126, v496); + svfloat32_t v174 = svmul_f32_x(svptrue_b32(), v127, v497); + svfloat32_t v184 = svmul_f32_x(svptrue_b32(), v129, v499); + svfloat32_t v189 = svmul_f32_x(svptrue_b32(), v130, v500); + svfloat32_t zero211 = svdup_n_f32(0); + svfloat32_t v211 = svcmla_f32_x(pred_full, zero211, v504, v134, 90); + svfloat32_t zero225 = svdup_n_f32(0); + svfloat32_t v225 = svcmla_f32_x(pred_full, zero225, v506, v136, 90); + svfloat32_t zero232 = svdup_n_f32(0); + svfloat32_t v232 = svcmla_f32_x(pred_full, zero232, v507, v137, 90); + svfloat32_t zero246 = svdup_n_f32(0); + 
svfloat32_t v246 = svcmla_f32_x(pred_full, zero246, v509, v139, 90); + svfloat32_t zero253 = svdup_n_f32(0); + svfloat32_t v253 = svcmla_f32_x(pred_full, zero253, v510, v140, 90); + svfloat32_t v113 = svadd_f32_x(svptrue_b32(), v110, v109); + svfloat32_t v124 = svsub_f32_x(svptrue_b32(), v123, v106); + svfloat32_t v204 = svmul_f32_x(svptrue_b32(), v133, v503); + svfloat32_t zero267 = svdup_n_f32(0); + svfloat32_t v267 = svcmla_f32_x(pred_full, zero267, v512, v142, 90); + svfloat32_t v269 = svmla_f32_x(pred_full, v169, v125, v495); + svfloat32_t v270 = svmla_f32_x(pred_full, v174, v126, v496); + svfloat32_t v271 = svnmls_f32_x(pred_full, v174, v125, v495); + svfloat32_t v272 = svmla_f32_x(pred_full, v184, v128, v498); + svfloat32_t v273 = svmla_f32_x(pred_full, v189, v129, v499); + svfloat32_t v274 = svnmls_f32_x(pred_full, v189, v128, v498); + svfloat32_t v277 = svcmla_f32_x(pred_full, v225, v505, v135, 90); + svfloat32_t v278 = svsub_f32_x(svptrue_b32(), v211, v225); + svfloat32_t v279 = svcmla_f32_x(pred_full, v246, v508, v138, 90); + svfloat32_t v280 = svsub_f32_x(svptrue_b32(), v232, v246); + svfloat32_t v122 = svadd_f32_x(svptrue_b32(), v121, v113); + svfloat32_t zero159 = svdup_n_f32(0); + svfloat32_t v159 = svcmla_f32_x(pred_full, zero159, v494, v124, 90); + svfloat32_t v275 = svmla_f32_x(pred_full, v204, v132, v502); + svfloat32_t v276 = svmla_f32_x(pred_full, v204, v131, v501); + svfloat32_t v281 = svcmla_f32_x(pred_full, v267, v511, v141, 90); + svfloat32_t v282 = svsub_f32_x(svptrue_b32(), v253, v267); + svfloat32_t v301 = svadd_f32_x(svptrue_b32(), v277, v278); + svfloat32_t v268 = svmls_f32_x(pred_full, v122, v113, v493); + svfloat32_t v283 = svadd_f32_x(svptrue_b32(), v273, v275); + svfloat32_t v293 = svadd_f32_x(svptrue_b32(), v159, v279); + svfloat32_t v295 = svsub_f32_x(svptrue_b32(), v281, v277); + svfloat32_t v297 = svadd_f32_x(svptrue_b32(), v159, v282); + svfloat32_t v299 = svsub_f32_x(svptrue_b32(), v282, v278); + svfloat32_t v302 = svadd_f32_x(svptrue_b32(), v301, v279); + svst1_f64(pred_full, (double *)(v520), svreinterpret_f64_f32(v122)); + svfloat32_t v284 = svadd_f32_x(svptrue_b32(), v283, v268); + svfloat32_t v285 = svsub_f32_x(svptrue_b32(), v268, v270); + svfloat32_t v287 = svadd_f32_x(svptrue_b32(), v268, v274); + svfloat32_t v289 = svsub_f32_x(svptrue_b32(), v268, v271); + svfloat32_t v291 = svadd_f32_x(svptrue_b32(), v268, v269); + svfloat32_t v294 = svadd_f32_x(svptrue_b32(), v293, v281); + svfloat32_t v296 = svsub_f32_x(svptrue_b32(), v295, v159); + svfloat32_t v298 = svadd_f32_x(svptrue_b32(), v297, v280); + svfloat32_t v300 = svsub_f32_x(svptrue_b32(), v299, v159); + svfloat32_t v303 = svadd_f32_x(svptrue_b32(), v302, v280); + svfloat32_t v286 = svsub_f32_x(svptrue_b32(), v285, v275); + svfloat32_t v288 = svadd_f32_x(svptrue_b32(), v287, v276); + svfloat32_t v290 = svsub_f32_x(svptrue_b32(), v289, v276); + svfloat32_t v292 = svsub_f32_x(svptrue_b32(), v291, v272); + svfloat32_t v304 = svsub_f32_x(svptrue_b32(), v303, v159); + svfloat32_t v306 = svadd_f32_x(svptrue_b32(), v284, v294); + svfloat32_t v313 = svsub_f32_x(svptrue_b32(), v284, v294); + svfloat32_t v305 = svadd_f32_x(svptrue_b32(), v292, v304); + svfloat32_t v307 = svadd_f32_x(svptrue_b32(), v286, v296); + svfloat32_t v308 = svsub_f32_x(svptrue_b32(), v288, v298); + svfloat32_t v309 = svadd_f32_x(svptrue_b32(), v290, v300); + svfloat32_t v310 = svsub_f32_x(svptrue_b32(), v290, v300); + svfloat32_t v311 = svadd_f32_x(svptrue_b32(), v288, v298); + svfloat32_t v312 = svsub_f32_x(svptrue_b32(), 
v286, v296); + svfloat32_t v314 = svsub_f32_x(svptrue_b32(), v292, v304); + svst1_f64(pred_full, (double *)(v538), svreinterpret_f64_f32(v306)); + svst1_f64(pred_full, (double *)(v601), svreinterpret_f64_f32(v313)); + svst1_f64(pred_full, (double *)(v529), svreinterpret_f64_f32(v305)); + svst1_f64(pred_full, (double *)(v547), svreinterpret_f64_f32(v307)); + svst1_f64(pred_full, (double *)(v556), svreinterpret_f64_f32(v308)); + svst1_f64(pred_full, (double *)(v565), svreinterpret_f64_f32(v309)); + svst1_f64(pred_full, (double *)(v574), svreinterpret_f64_f32(v310)); + svst1_f64(pred_full, (double *)(v583), svreinterpret_f64_f32(v311)); + svst1_f64(pred_full, (double *)(v592), svreinterpret_f64_f32(v312)); + svst1_f64(pred_full, (double *)(v610), svreinterpret_f64_f32(v314)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cf32_ac_n_gu13(const armral_cmplx_int16_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + int idist, float dir) { + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + for (int j = 0; j < howmany; j += 1) { + int16x4_t v20 = vld1s_s16(&v5[istride]); + float v142 = 1.0833333333333333e+00F; + float v146 = -3.0046260628866578e-01F; + float v149 = 7.4927933062613905e-01F; + float v150 = -7.4927933062613905e-01F; + float v156 = 4.0100212832186721e-01F; + float v157 = -4.0100212832186721e-01F; + float v163 = 5.7514072947400308e-01F; + float v164 = -5.7514072947400308e-01F; + float v171 = 5.2422663952658211e-01F; + float v175 = 5.1652078062348972e-01F; + float v179 = 7.7058589030924258e-03F; + float v183 = 4.2763404682656941e-01F; + float v187 = 1.5180597207438440e-01F; + float v191 = 5.7944001890096386e-01F; + float v194 = 1.1543953381323635e+00F; + float v195 = -1.1543953381323635e+00F; + float v201 = 9.0655220171271012e-01F; + float v202 = -9.0655220171271012e-01F; + float v208 = 8.1857027294591811e-01F; + float v209 = -8.1857027294591811e-01F; + float v215 = 1.1971367726043427e+00F; + float v216 = -1.1971367726043427e+00F; + float v222 = 8.6131170741789742e-01F; + float v223 = -8.6131170741789742e-01F; + float v229 = 1.1091548438375507e+00F; + float v230 = -1.1091548438375507e+00F; + float v236 = 4.2741434471979367e-02F; + float v237 = -4.2741434471979367e-02F; + float v243 = -4.5240494294812715e-02F; + float v244 = 4.5240494294812715e-02F; + float v250 = 2.9058457089163264e-01F; + float v251 = -2.9058457089163264e-01F; + float32x2_t v253 = (float32x2_t){v4, v4}; + float32x2_t v21 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v20)), 15); + int16x4_t v127 = vld1s_s16(&v5[0]); + float32x2_t v143 = (float32x2_t){v142, v142}; + float32x2_t v147 = (float32x2_t){v146, v146}; + float32x2_t v151 = (float32x2_t){v149, v150}; + float32x2_t v158 = (float32x2_t){v156, v157}; + float32x2_t v165 = (float32x2_t){v163, v164}; + float32x2_t v172 = (float32x2_t){v171, v171}; + float32x2_t v176 = (float32x2_t){v175, v175}; + float32x2_t v180 = (float32x2_t){v179, v179}; + float32x2_t v184 = (float32x2_t){v183, v183}; + float32x2_t v188 = (float32x2_t){v187, v187}; + float32x2_t v192 = (float32x2_t){v191, v191}; + float32x2_t v196 = (float32x2_t){v194, v195}; + float32x2_t v203 = (float32x2_t){v201, v202}; + float32x2_t v210 = (float32x2_t){v208, v209}; + float32x2_t v217 = (float32x2_t){v215, v216}; + float32x2_t v224 = (float32x2_t){v222, v223}; + float32x2_t v231 = (float32x2_t){v229, v230}; + float32x2_t v238 = (float32x2_t){v236, v237}; + float32x2_t v245 = 
(float32x2_t){v243, v244}; + float32x2_t v252 = (float32x2_t){v250, v251}; + int16x4_t v26 = vld1s_s16(&v5[istride * 12]); + int16x4_t v33 = vld1s_s16(&v5[istride * 2]); + int16x4_t v39 = vld1s_s16(&v5[istride * 11]); + int16x4_t v46 = vld1s_s16(&v5[istride * 3]); + int16x4_t v52 = vld1s_s16(&v5[istride * 10]); + int16x4_t v59 = vld1s_s16(&v5[istride * 4]); + int16x4_t v65 = vld1s_s16(&v5[istride * 9]); + int16x4_t v72 = vld1s_s16(&v5[istride * 5]); + int16x4_t v78 = vld1s_s16(&v5[istride * 8]); + int16x4_t v85 = vld1s_s16(&v5[istride * 6]); + int16x4_t v91 = vld1s_s16(&v5[istride * 7]); + float32x2_t v128 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v127)), 15); + float32x2_t v153 = vmul_f32(v253, v151); + float32x2_t v160 = vmul_f32(v253, v158); + float32x2_t v167 = vmul_f32(v253, v165); + float32x2_t v198 = vmul_f32(v253, v196); + float32x2_t v205 = vmul_f32(v253, v203); + float32x2_t v212 = vmul_f32(v253, v210); + float32x2_t v219 = vmul_f32(v253, v217); + float32x2_t v226 = vmul_f32(v253, v224); + float32x2_t v233 = vmul_f32(v253, v231); + float32x2_t v240 = vmul_f32(v253, v238); + float32x2_t v247 = vmul_f32(v253, v245); + float32x2_t v254 = vmul_f32(v253, v252); + float32x2_t v27 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v26)), 15); + float32x2_t v34 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v33)), 15); + float32x2_t v40 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v39)), 15); + float32x2_t v47 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v46)), 15); + float32x2_t v53 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v52)), 15); + float32x2_t v60 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v59)), 15); + float32x2_t v66 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v65)), 15); + float32x2_t v73 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v72)), 15); + float32x2_t v79 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v78)), 15); + float32x2_t v86 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v85)), 15); + float32x2_t v92 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v91)), 15); + float32x2_t v28 = vadd_f32(v21, v27); + float32x2_t v41 = vadd_f32(v34, v40); + float32x2_t v54 = vadd_f32(v47, v53); + float32x2_t v67 = vadd_f32(v60, v66); + float32x2_t v80 = vadd_f32(v73, v79); + float32x2_t v93 = vadd_f32(v86, v92); + float32x2_t v94 = vsub_f32(v21, v27); + float32x2_t v95 = vsub_f32(v34, v40); + float32x2_t v96 = vsub_f32(v47, v53); + float32x2_t v97 = vsub_f32(v60, v66); + float32x2_t v98 = vsub_f32(v73, v79); + float32x2_t v99 = vsub_f32(v86, v92); + float32x2_t v100 = vadd_f32(v41, v80); + float32x2_t v102 = vadd_f32(v28, v54); + float32x2_t v105 = vadd_f32(v95, v98); + float32x2_t v107 = vadd_f32(v94, v96); + float32x2_t v109 = vsub_f32(v41, v93); + float32x2_t v110 = vsub_f32(v54, v67); + float32x2_t v111 = vsub_f32(v28, v67); + float32x2_t v112 = vsub_f32(v80, v93); + float32x2_t v117 = vsub_f32(v95, v99); + float32x2_t v118 = vsub_f32(v94, v96); + float32x2_t v119 = vsub_f32(v95, v98); + float32x2_t v120 = vadd_f32(v94, v97); + float32x2_t v121 = vsub_f32(v98, v99); + float32x2_t v122 = vadd_f32(v96, v97); + float32x2_t v101 = vadd_f32(v100, v93); + float32x2_t v103 = vadd_f32(v102, v67); + float32x2_t v106 = vadd_f32(v105, v99); + float32x2_t v108 = vsub_f32(v107, v97); + float32x2_t v113 = vsub_f32(v109, v110); + float32x2_t v114 = vsub_f32(v111, v112); + float32x2_t v115 = vadd_f32(v109, v110); + float32x2_t v116 = vadd_f32(v111, v112); + float32x2_t v134 = vadd_f32(v117, v118); + float32x2_t v135 = vadd_f32(v119, v120); + float32x2_t v136 = vsub_f32(v121, v122); + float32x2_t v199 = vrev64_f32(v117); + float32x2_t v206 = vrev64_f32(v118); 
+    float32x2_t v220 = vrev64_f32(v119);
+    float32x2_t v227 = vrev64_f32(v120);
+    float32x2_t v241 = vrev64_f32(v121);
+    float32x2_t v248 = vrev64_f32(v122);
+    float32x2_t v104 = vadd_f32(v101, v103);
+    float32x2_t v130 = vsub_f32(v103, v101);
+    float32x2_t v131 = vadd_f32(v106, v108);
+    float32x2_t v132 = vadd_f32(v113, v114);
+    float32x2_t v133 = vsub_f32(v115, v116);
+    float32x2_t v154 = vrev64_f32(v106);
+    float32x2_t v161 = vrev64_f32(v108);
+    float32x2_t v173 = vmul_f32(v113, v172);
+    float32x2_t v177 = vmul_f32(v114, v176);
+    float32x2_t v185 = vmul_f32(v115, v184);
+    float32x2_t v189 = vmul_f32(v116, v188);
+    float32x2_t v200 = vmul_f32(v199, v198);
+    float32x2_t v207 = vmul_f32(v206, v205);
+    float32x2_t v213 = vrev64_f32(v134);
+    float32x2_t v221 = vmul_f32(v220, v219);
+    float32x2_t v228 = vmul_f32(v227, v226);
+    float32x2_t v234 = vrev64_f32(v135);
+    float32x2_t v242 = vmul_f32(v241, v240);
+    float32x2_t v249 = vmul_f32(v248, v247);
+    float32x2_t v255 = vrev64_f32(v136);
+    float32x2_t v129 = vadd_f32(v128, v104);
+    float32x2_t v144 = vmul_f32(v104, v143);
+    float32x2_t v148 = vmul_f32(v130, v147);
+    float32x2_t v155 = vmul_f32(v154, v153);
+    float32x2_t v162 = vmul_f32(v161, v160);
+    float32x2_t v168 = vrev64_f32(v131);
+    float32x2_t v181 = vmul_f32(v132, v180);
+    float32x2_t v193 = vmul_f32(v133, v192);
+    float32x2_t v214 = vmul_f32(v213, v212);
+    float32x2_t v235 = vmul_f32(v234, v233);
+    float32x2_t v256 = vmul_f32(v255, v254);
+    float32x2_t v258 = vadd_f32(v177, v173);
+    float32x2_t v169 = vmul_f32(v168, v167);
+    float32x2_t v257 = vsub_f32(v129, v144);
+    float32x2_t v259 = vsub_f32(v258, v148);
+    float32x2_t v260 = vadd_f32(v177, v181);
+    float32x2_t v262 = vsub_f32(v181, v173);
+    float32x2_t v270 = vsub_f32(v200, v214);
+    float32x2_t v271 = vsub_f32(v207, v214);
+    float32x2_t v272 = vsub_f32(v221, v235);
+    float32x2_t v273 = vsub_f32(v228, v235);
+    float32x2_t v274 = vsub_f32(v242, v256);
+    float32x2_t v275 = vadd_f32(v249, v256);
+    v6[0] = v129;
+    float32x2_t v261 = vadd_f32(v260, v148);
+    float32x2_t v263 = vsub_f32(v262, v148);
+    float32x2_t v264 = vadd_f32(v257, v185);
+    float32x2_t v266 = vsub_f32(v257, v189);
+    float32x2_t v268 = vsub_f32(v257, v185);
+    float32x2_t v276 = vsub_f32(v155, v169);
+    float32x2_t v277 = vsub_f32(v162, v169);
+    float32x2_t v288 = vadd_f32(v270, v274);
+    float32x2_t v290 = vadd_f32(v272, v274);
+    float32x2_t v292 = vsub_f32(v271, v275);
+    float32x2_t v265 = vadd_f32(v264, v189);
+    float32x2_t v267 = vsub_f32(v266, v193);
+    float32x2_t v269 = vadd_f32(v268, v193);
+    float32x2_t v284 = vsub_f32(v277, v270);
+    float32x2_t v286 = vsub_f32(v275, v276);
+    float32x2_t v289 = vadd_f32(v288, v277);
+    float32x2_t v291 = vsub_f32(v290, v277);
+    float32x2_t v293 = vsub_f32(v292, v276);
+    float32x2_t v294 = vadd_f32(v276, v271);
+    float32x2_t v278 = vadd_f32(v259, v265);
+    float32x2_t v279 = vadd_f32(v261, v267);
+    float32x2_t v280 = vsub_f32(v267, v261);
+    float32x2_t v281 = vadd_f32(v263, v269);
+    float32x2_t v282 = vsub_f32(v265, v259);
+    float32x2_t v283 = vsub_f32(v269, v263);
+    float32x2_t v285 = vadd_f32(v284, v272);
+    float32x2_t v287 = vsub_f32(v286, v273);
+    float32x2_t v295 = vsub_f32(v294, v273);
+    float32x2_t v296 = vsub_f32(v278, v285);
+    float32x2_t v297 = vadd_f32(v279, v287);
+    float32x2_t v298 = vsub_f32(v280, v289);
+    float32x2_t v299 = vsub_f32(v281, v291);
+    float32x2_t v300 = vadd_f32(v282, v293);
+    float32x2_t v301 = vsub_f32(v283, v295);
+    float32x2_t v302 = vadd_f32(v283, v295);
+    float32x2_t v303 = vsub_f32(v282, v293);
+
float32x2_t v304 = vadd_f32(v281, v291); + float32x2_t v305 = vadd_f32(v280, v289); + float32x2_t v306 = vsub_f32(v279, v287); + float32x2_t v307 = vadd_f32(v278, v285); + v6[ostride * 12] = v296; + v6[ostride * 11] = v297; + v6[ostride * 10] = v298; + v6[ostride * 9] = v299; + v6[ostride * 8] = v300; + v6[ostride * 7] = v301; + v6[ostride * 6] = v302; + v6[ostride * 5] = v303; + v6[ostride * 4] = v304; + v6[ostride * 3] = v305; + v6[ostride * 2] = v306; + v6[ostride] = v307; + v5 += 1 * idist; + v6 += 1 * 1; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cf32_ac_n_gu13(const armral_cmplx_int16_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + int idist, float dir) { + int64_t v0 = istride; + int64_t v1 = idist; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * v1; + int64_t v12 = v10 * 1; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + float v171 = 1.0833333333333333e+00F; + float v176 = -3.0046260628866578e-01F; + float v181 = -7.4927933062613905e-01F; + float v188 = -4.0100212832186721e-01F; + float v195 = -5.7514072947400308e-01F; + float v202 = 5.2422663952658211e-01F; + float v207 = 5.1652078062348972e-01F; + float v212 = 7.7058589030924258e-03F; + float v217 = 4.2763404682656941e-01F; + float v222 = 1.5180597207438440e-01F; + float v227 = 5.7944001890096386e-01F; + float v232 = -1.1543953381323635e+00F; + float v239 = -9.0655220171271012e-01F; + float v246 = -8.1857027294591811e-01F; + float v253 = -1.1971367726043427e+00F; + float v260 = -8.6131170741789742e-01F; + float v267 = -1.1091548438375507e+00F; + float v274 = -4.2741434471979367e-02F; + float v281 = 4.5240494294812715e-02F; + float v288 = -2.9058457089163264e-01F; + const int32_t *v442 = &v5[v0]; + float32x2_t *v690 = &v6[v2]; + int64_t v27 = v0 * 12; + int64_t v36 = v0 * 2; + int64_t v44 = v0 * 11; + int64_t v53 = v0 * 3; + int64_t v61 = v0 * 10; + int64_t v70 = v0 * 4; + int64_t v78 = v0 * 9; + int64_t v87 = v0 * 5; + int64_t v95 = v0 * 8; + int64_t v104 = v0 * 6; + int64_t v112 = v0 * 7; + float v184 = v4 * v181; + float v191 = v4 * v188; + float v198 = v4 * v195; + float v235 = v4 * v232; + float v242 = v4 * v239; + float v249 = v4 * v246; + float v256 = v4 * v253; + float v263 = v4 * v260; + float v270 = v4 * v267; + float v277 = v4 * v274; + float v284 = v4 * v281; + float v291 = v4 * v288; + int64_t v353 = v2 * 12; + int64_t v360 = v2 * 11; + int64_t v367 = v2 * 10; + int64_t v374 = v2 * 9; + int64_t v381 = v2 * 8; + int64_t v388 = v2 * 7; + int64_t v395 = v2 * 6; + int64_t v402 = v2 * 5; + int64_t v409 = v2 * 4; + int64_t v416 = v2 * 3; + int64_t v423 = v2 * 2; + const int32_t *v551 = &v5[0]; + svint64_t v552 = svindex_s64(0, v1); + svfloat32_t v555 = svdup_n_f32(v171); + svfloat32_t v556 = svdup_n_f32(v176); + svfloat32_t v560 = svdup_n_f32(v202); + svfloat32_t v561 = svdup_n_f32(v207); + svfloat32_t v562 = svdup_n_f32(v212); + svfloat32_t v563 = svdup_n_f32(v217); + svfloat32_t v564 = svdup_n_f32(v222); + svfloat32_t v565 = svdup_n_f32(v227); + float32x2_t *v582 = &v6[0]; + svint16_t v444 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v442), v552)); + const int32_t *v451 = &v5[v27]; + const int32_t *v460 = &v5[v36]; + const int32_t *v469 = &v5[v44]; + const int32_t *v478 = &v5[v53]; + const int32_t *v487 = &v5[v61]; + const 
int32_t *v496 = &v5[v70]; + const int32_t *v505 = &v5[v78]; + const int32_t *v514 = &v5[v87]; + const int32_t *v523 = &v5[v95]; + const int32_t *v532 = &v5[v104]; + const int32_t *v541 = &v5[v112]; + svint16_t v553 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v551), v552)); + svfloat32_t v557 = svdup_n_f32(v184); + svfloat32_t v558 = svdup_n_f32(v191); + svfloat32_t v559 = svdup_n_f32(v198); + svfloat32_t v566 = svdup_n_f32(v235); + svfloat32_t v567 = svdup_n_f32(v242); + svfloat32_t v568 = svdup_n_f32(v249); + svfloat32_t v569 = svdup_n_f32(v256); + svfloat32_t v570 = svdup_n_f32(v263); + svfloat32_t v571 = svdup_n_f32(v270); + svfloat32_t v572 = svdup_n_f32(v277); + svfloat32_t v573 = svdup_n_f32(v284); + svfloat32_t v574 = svdup_n_f32(v291); + float32x2_t *v591 = &v6[v353]; + float32x2_t *v600 = &v6[v360]; + float32x2_t *v609 = &v6[v367]; + float32x2_t *v618 = &v6[v374]; + float32x2_t *v627 = &v6[v381]; + float32x2_t *v636 = &v6[v388]; + float32x2_t *v645 = &v6[v395]; + float32x2_t *v654 = &v6[v402]; + float32x2_t *v663 = &v6[v409]; + float32x2_t *v672 = &v6[v416]; + float32x2_t *v681 = &v6[v423]; + svfloat32_t v25 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v444, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v156 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v553, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svint16_t v453 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v451), v552)); + svint16_t v462 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v460), v552)); + svint16_t v471 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v469), v552)); + svint16_t v480 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v478), v552)); + svint16_t v489 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v487), v552)); + svint16_t v498 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v496), v552)); + svint16_t v507 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v505), v552)); + svint16_t v516 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v514), v552)); + svint16_t v525 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v523), v552)); + svint16_t v534 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v532), v552)); + svint16_t v543 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v541), v552)); + svfloat32_t v33 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v453, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v42 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v462, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v50 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v471, svreinterpret_u16_u64(svindex_u64( + 
0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v59 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v480, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v67 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v489, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v76 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v498, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v84 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v507, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v93 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v516, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v101 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v525, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v110 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v534, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v118 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v543, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v34 = svadd_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v51 = svadd_f32_x(svptrue_b32(), v42, v50); + svfloat32_t v68 = svadd_f32_x(svptrue_b32(), v59, v67); + svfloat32_t v85 = svadd_f32_x(svptrue_b32(), v76, v84); + svfloat32_t v102 = svadd_f32_x(svptrue_b32(), v93, v101); + svfloat32_t v119 = svadd_f32_x(svptrue_b32(), v110, v118); + svfloat32_t v120 = svsub_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v121 = svsub_f32_x(svptrue_b32(), v42, v50); + svfloat32_t v122 = svsub_f32_x(svptrue_b32(), v59, v67); + svfloat32_t v123 = svsub_f32_x(svptrue_b32(), v76, v84); + svfloat32_t v124 = svsub_f32_x(svptrue_b32(), v93, v101); + svfloat32_t v125 = svsub_f32_x(svptrue_b32(), v110, v118); + svfloat32_t v126 = svadd_f32_x(svptrue_b32(), v51, v102); + svfloat32_t v128 = svadd_f32_x(svptrue_b32(), v34, v68); + svfloat32_t v131 = svadd_f32_x(svptrue_b32(), v121, v124); + svfloat32_t v133 = svadd_f32_x(svptrue_b32(), v120, v122); + svfloat32_t v135 = svsub_f32_x(svptrue_b32(), v51, v119); + svfloat32_t v136 = svsub_f32_x(svptrue_b32(), v68, v85); + svfloat32_t v137 = svsub_f32_x(svptrue_b32(), v34, v85); + svfloat32_t v138 = svsub_f32_x(svptrue_b32(), v102, v119); + svfloat32_t v143 = svsub_f32_x(svptrue_b32(), v121, v125); + svfloat32_t v144 = svsub_f32_x(svptrue_b32(), v120, v122); + svfloat32_t v145 = svsub_f32_x(svptrue_b32(), v121, v124); + svfloat32_t v146 = svadd_f32_x(svptrue_b32(), v120, v123); + svfloat32_t v147 = svsub_f32_x(svptrue_b32(), v124, v125); + svfloat32_t v148 = svadd_f32_x(svptrue_b32(), v122, v123); + svfloat32_t v127 = 
svadd_f32_x(svptrue_b32(), v126, v119); + svfloat32_t v129 = svadd_f32_x(svptrue_b32(), v128, v85); + svfloat32_t v132 = svadd_f32_x(svptrue_b32(), v131, v125); + svfloat32_t v134 = svsub_f32_x(svptrue_b32(), v133, v123); + svfloat32_t v139 = svsub_f32_x(svptrue_b32(), v135, v136); + svfloat32_t v140 = svsub_f32_x(svptrue_b32(), v137, v138); + svfloat32_t v141 = svadd_f32_x(svptrue_b32(), v135, v136); + svfloat32_t v142 = svadd_f32_x(svptrue_b32(), v137, v138); + svfloat32_t v162 = svadd_f32_x(svptrue_b32(), v143, v144); + svfloat32_t v163 = svadd_f32_x(svptrue_b32(), v145, v146); + svfloat32_t v164 = svsub_f32_x(svptrue_b32(), v147, v148); + svfloat32_t zero237 = svdup_n_f32(0); + svfloat32_t v237 = svcmla_f32_x(pred_full, zero237, v566, v143, 90); + svfloat32_t zero244 = svdup_n_f32(0); + svfloat32_t v244 = svcmla_f32_x(pred_full, zero244, v567, v144, 90); + svfloat32_t zero258 = svdup_n_f32(0); + svfloat32_t v258 = svcmla_f32_x(pred_full, zero258, v569, v145, 90); + svfloat32_t zero265 = svdup_n_f32(0); + svfloat32_t v265 = svcmla_f32_x(pred_full, zero265, v570, v146, 90); + svfloat32_t zero279 = svdup_n_f32(0); + svfloat32_t v279 = svcmla_f32_x(pred_full, zero279, v572, v147, 90); + svfloat32_t v130 = svadd_f32_x(svptrue_b32(), v127, v129); + svfloat32_t v158 = svsub_f32_x(svptrue_b32(), v129, v127); + svfloat32_t v159 = svadd_f32_x(svptrue_b32(), v132, v134); + svfloat32_t v160 = svadd_f32_x(svptrue_b32(), v139, v140); + svfloat32_t v161 = svsub_f32_x(svptrue_b32(), v141, v142); + svfloat32_t zero186 = svdup_n_f32(0); + svfloat32_t v186 = svcmla_f32_x(pred_full, zero186, v557, v132, 90); + svfloat32_t zero193 = svdup_n_f32(0); + svfloat32_t v193 = svcmla_f32_x(pred_full, zero193, v558, v134, 90); + svfloat32_t v205 = svmul_f32_x(svptrue_b32(), v139, v560); + svfloat32_t zero251 = svdup_n_f32(0); + svfloat32_t v251 = svcmla_f32_x(pred_full, zero251, v568, v162, 90); + svfloat32_t zero272 = svdup_n_f32(0); + svfloat32_t v272 = svcmla_f32_x(pred_full, zero272, v571, v163, 90); + svfloat32_t zero293 = svdup_n_f32(0); + svfloat32_t v293 = svcmla_f32_x(pred_full, zero293, v574, v164, 90); + svfloat32_t v157 = svadd_f32_x(svptrue_b32(), v156, v130); + svfloat32_t zero200 = svdup_n_f32(0); + svfloat32_t v200 = svcmla_f32_x(pred_full, zero200, v559, v159, 90); + svfloat32_t v215 = svmul_f32_x(svptrue_b32(), v160, v562); + svfloat32_t v295 = svmla_f32_x(pred_full, v205, v140, v561); + svfloat32_t v307 = svsub_f32_x(svptrue_b32(), v237, v251); + svfloat32_t v308 = svsub_f32_x(svptrue_b32(), v244, v251); + svfloat32_t v309 = svsub_f32_x(svptrue_b32(), v258, v272); + svfloat32_t v310 = svsub_f32_x(svptrue_b32(), v265, v272); + svfloat32_t v311 = svsub_f32_x(svptrue_b32(), v279, v293); + svfloat32_t v312 = svcmla_f32_x(pred_full, v293, v573, v148, 90); + svfloat32_t v294 = svmls_f32_x(pred_full, v157, v130, v555); + svfloat32_t v296 = svmls_f32_x(pred_full, v295, v158, v556); + svfloat32_t v297 = svmla_f32_x(pred_full, v215, v140, v561); + svfloat32_t v299 = svnmls_f32_x(pred_full, v205, v160, v562); + svfloat32_t v313 = svsub_f32_x(svptrue_b32(), v186, v200); + svfloat32_t v314 = svsub_f32_x(svptrue_b32(), v193, v200); + svfloat32_t v325 = svadd_f32_x(svptrue_b32(), v307, v311); + svfloat32_t v327 = svadd_f32_x(svptrue_b32(), v309, v311); + svfloat32_t v329 = svsub_f32_x(svptrue_b32(), v308, v312); + svst1_f64(pred_full, (double *)(v582), svreinterpret_f64_f32(v157)); + svfloat32_t v298 = svmla_f32_x(pred_full, v297, v158, v556); + svfloat32_t v300 = svmls_f32_x(pred_full, v299, v158, v556); + 
svfloat32_t v301 = svmla_f32_x(pred_full, v294, v141, v563); + svfloat32_t v303 = svmls_f32_x(pred_full, v294, v142, v564); + svfloat32_t v305 = svmls_f32_x(pred_full, v294, v141, v563); + svfloat32_t v321 = svsub_f32_x(svptrue_b32(), v314, v307); + svfloat32_t v323 = svsub_f32_x(svptrue_b32(), v312, v313); + svfloat32_t v326 = svadd_f32_x(svptrue_b32(), v325, v314); + svfloat32_t v328 = svsub_f32_x(svptrue_b32(), v327, v314); + svfloat32_t v330 = svsub_f32_x(svptrue_b32(), v329, v313); + svfloat32_t v331 = svadd_f32_x(svptrue_b32(), v313, v308); + svfloat32_t v302 = svmla_f32_x(pred_full, v301, v142, v564); + svfloat32_t v304 = svmls_f32_x(pred_full, v303, v161, v565); + svfloat32_t v306 = svmla_f32_x(pred_full, v305, v161, v565); + svfloat32_t v322 = svadd_f32_x(svptrue_b32(), v321, v309); + svfloat32_t v324 = svsub_f32_x(svptrue_b32(), v323, v310); + svfloat32_t v332 = svsub_f32_x(svptrue_b32(), v331, v310); + svfloat32_t v315 = svadd_f32_x(svptrue_b32(), v296, v302); + svfloat32_t v316 = svadd_f32_x(svptrue_b32(), v298, v304); + svfloat32_t v317 = svsub_f32_x(svptrue_b32(), v304, v298); + svfloat32_t v318 = svadd_f32_x(svptrue_b32(), v300, v306); + svfloat32_t v319 = svsub_f32_x(svptrue_b32(), v302, v296); + svfloat32_t v320 = svsub_f32_x(svptrue_b32(), v306, v300); + svfloat32_t v333 = svsub_f32_x(svptrue_b32(), v315, v322); + svfloat32_t v334 = svadd_f32_x(svptrue_b32(), v316, v324); + svfloat32_t v335 = svsub_f32_x(svptrue_b32(), v317, v326); + svfloat32_t v336 = svsub_f32_x(svptrue_b32(), v318, v328); + svfloat32_t v337 = svadd_f32_x(svptrue_b32(), v319, v330); + svfloat32_t v338 = svsub_f32_x(svptrue_b32(), v320, v332); + svfloat32_t v339 = svadd_f32_x(svptrue_b32(), v320, v332); + svfloat32_t v340 = svsub_f32_x(svptrue_b32(), v319, v330); + svfloat32_t v341 = svadd_f32_x(svptrue_b32(), v318, v328); + svfloat32_t v342 = svadd_f32_x(svptrue_b32(), v317, v326); + svfloat32_t v343 = svsub_f32_x(svptrue_b32(), v316, v324); + svfloat32_t v344 = svadd_f32_x(svptrue_b32(), v315, v322); + svst1_f64(pred_full, (double *)(v591), svreinterpret_f64_f32(v333)); + svst1_f64(pred_full, (double *)(v600), svreinterpret_f64_f32(v334)); + svst1_f64(pred_full, (double *)(v609), svreinterpret_f64_f32(v335)); + svst1_f64(pred_full, (double *)(v618), svreinterpret_f64_f32(v336)); + svst1_f64(pred_full, (double *)(v627), svreinterpret_f64_f32(v337)); + svst1_f64(pred_full, (double *)(v636), svreinterpret_f64_f32(v338)); + svst1_f64(pred_full, (double *)(v645), svreinterpret_f64_f32(v339)); + svst1_f64(pred_full, (double *)(v654), svreinterpret_f64_f32(v340)); + svst1_f64(pred_full, (double *)(v663), svreinterpret_f64_f32(v341)); + svst1_f64(pred_full, (double *)(v672), svreinterpret_f64_f32(v342)); + svst1_f64(pred_full, (double *)(v681), svreinterpret_f64_f32(v343)); + svst1_f64(pred_full, (double *)(v690), svreinterpret_f64_f32(v344)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cf32_ac_n_gu14(const armral_cmplx_int16_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + int idist, float dir) { + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + for (int j = 0; j < howmany; j += 1) { + int16x4_t v82 = vld1s_s16(&v5[istride]); + float v220 = -1.1666666666666665e+00F; + float v224 = 7.9015646852540022e-01F; + float v228 = 5.5854267289647742e-02F; + float v232 = 7.3430220123575241e-01F; + float v235 = 4.4095855184409838e-01F; + float v236 = -4.4095855184409838e-01F; + 
float v242 = 3.4087293062393137e-01F; + float v243 = -3.4087293062393137e-01F; + float v249 = -5.3396936033772524e-01F; + float v250 = 5.3396936033772524e-01F; + float v256 = 8.7484229096165667e-01F; + float v257 = -8.7484229096165667e-01F; + float32x2_t v259 = (float32x2_t){v4, v4}; + int16x4_t v20 = vld1s_s16(&v5[0]); + float32x2_t v83 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v82)), 15); + float32x2_t v221 = (float32x2_t){v220, v220}; + float32x2_t v225 = (float32x2_t){v224, v224}; + float32x2_t v229 = (float32x2_t){v228, v228}; + float32x2_t v233 = (float32x2_t){v232, v232}; + float32x2_t v237 = (float32x2_t){v235, v236}; + float32x2_t v244 = (float32x2_t){v242, v243}; + float32x2_t v251 = (float32x2_t){v249, v250}; + float32x2_t v258 = (float32x2_t){v256, v257}; + float32x2_t v21 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v20)), 15); + int16x4_t v26 = vld1s_s16(&v5[istride * 7]); + int16x4_t v34 = vld1s_s16(&v5[istride * 2]); + int16x4_t v40 = vld1s_s16(&v5[istride * 9]); + int16x4_t v48 = vld1s_s16(&v5[istride * 4]); + int16x4_t v54 = vld1s_s16(&v5[istride * 11]); + int16x4_t v62 = vld1s_s16(&v5[istride * 6]); + int16x4_t v68 = vld1s_s16(&v5[istride * 13]); + int16x4_t v76 = vld1s_s16(&v5[istride * 8]); + int16x4_t v90 = vld1s_s16(&v5[istride * 10]); + int16x4_t v96 = vld1s_s16(&v5[istride * 3]); + int16x4_t v104 = vld1s_s16(&v5[istride * 12]); + int16x4_t v110 = vld1s_s16(&v5[istride * 5]); + float32x2_t v239 = vmul_f32(v259, v237); + float32x2_t v246 = vmul_f32(v259, v244); + float32x2_t v253 = vmul_f32(v259, v251); + float32x2_t v260 = vmul_f32(v259, v258); + float32x2_t v27 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v26)), 15); + float32x2_t v35 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v34)), 15); + float32x2_t v41 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v40)), 15); + float32x2_t v49 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v48)), 15); + float32x2_t v55 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v54)), 15); + float32x2_t v63 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v62)), 15); + float32x2_t v69 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v68)), 15); + float32x2_t v77 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v76)), 15); + float32x2_t v91 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v90)), 15); + float32x2_t v97 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v96)), 15); + float32x2_t v105 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v104)), 15); + float32x2_t v111 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v110)), 15); + float32x2_t v28 = vadd_f32(v21, v27); + float32x2_t v29 = vsub_f32(v21, v27); + float32x2_t v42 = vadd_f32(v35, v41); + float32x2_t v43 = vsub_f32(v35, v41); + float32x2_t v56 = vadd_f32(v49, v55); + float32x2_t v57 = vsub_f32(v49, v55); + float32x2_t v70 = vadd_f32(v63, v69); + float32x2_t v71 = vsub_f32(v63, v69); + float32x2_t v84 = vadd_f32(v77, v83); + float32x2_t v85 = vsub_f32(v77, v83); + float32x2_t v98 = vadd_f32(v91, v97); + float32x2_t v99 = vsub_f32(v91, v97); + float32x2_t v112 = vadd_f32(v105, v111); + float32x2_t v113 = vsub_f32(v105, v111); + float32x2_t v114 = vadd_f32(v42, v112); + float32x2_t v115 = vsub_f32(v42, v112); + float32x2_t v116 = vadd_f32(v84, v70); + float32x2_t v117 = vsub_f32(v84, v70); + float32x2_t v118 = vadd_f32(v56, v98); + float32x2_t v119 = vsub_f32(v56, v98); + float32x2_t v198 = vadd_f32(v43, v113); + float32x2_t v199 = vsub_f32(v43, v113); + float32x2_t v200 = vadd_f32(v85, v71); + float32x2_t v201 = vsub_f32(v85, v71); + float32x2_t v202 = vadd_f32(v57, v99); + float32x2_t v203 = vsub_f32(v57, v99); + float32x2_t v120 = vadd_f32(v114, v116); + float32x2_t 
v123 = vsub_f32(v114, v116); + float32x2_t v124 = vsub_f32(v116, v118); + float32x2_t v125 = vsub_f32(v118, v114); + float32x2_t v126 = vadd_f32(v115, v117); + float32x2_t v128 = vsub_f32(v115, v117); + float32x2_t v129 = vsub_f32(v117, v119); + float32x2_t v130 = vsub_f32(v119, v115); + float32x2_t v204 = vadd_f32(v198, v200); + float32x2_t v207 = vsub_f32(v198, v200); + float32x2_t v208 = vsub_f32(v200, v202); + float32x2_t v209 = vsub_f32(v202, v198); + float32x2_t v210 = vadd_f32(v199, v201); + float32x2_t v212 = vsub_f32(v199, v201); + float32x2_t v213 = vsub_f32(v201, v203); + float32x2_t v214 = vsub_f32(v203, v199); + float32x2_t v121 = vadd_f32(v120, v118); + float32x2_t v127 = vadd_f32(v126, v119); + float32x2_t v142 = vmul_f32(v123, v225); + float32x2_t v146 = vmul_f32(v124, v229); + float32x2_t v150 = vmul_f32(v125, v233); + float32x2_t v163 = vrev64_f32(v128); + float32x2_t v170 = vrev64_f32(v129); + float32x2_t v177 = vrev64_f32(v130); + float32x2_t v205 = vadd_f32(v204, v202); + float32x2_t v211 = vadd_f32(v210, v203); + float32x2_t v226 = vmul_f32(v207, v225); + float32x2_t v230 = vmul_f32(v208, v229); + float32x2_t v234 = vmul_f32(v209, v233); + float32x2_t v247 = vrev64_f32(v212); + float32x2_t v254 = vrev64_f32(v213); + float32x2_t v261 = vrev64_f32(v214); + float32x2_t v122 = vadd_f32(v121, v28); + float32x2_t v138 = vmul_f32(v121, v221); + float32x2_t v156 = vrev64_f32(v127); + float32x2_t v164 = vmul_f32(v163, v246); + float32x2_t v171 = vmul_f32(v170, v253); + float32x2_t v178 = vmul_f32(v177, v260); + float32x2_t v206 = vadd_f32(v205, v29); + float32x2_t v222 = vmul_f32(v205, v221); + float32x2_t v240 = vrev64_f32(v211); + float32x2_t v248 = vmul_f32(v247, v246); + float32x2_t v255 = vmul_f32(v254, v253); + float32x2_t v262 = vmul_f32(v261, v260); + float32x2_t v157 = vmul_f32(v156, v239); + float32x2_t v179 = vadd_f32(v122, v138); + float32x2_t v241 = vmul_f32(v240, v239); + float32x2_t v263 = vadd_f32(v206, v222); + v6[0] = v122; + v6[ostride * 7] = v206; + float32x2_t v180 = vadd_f32(v179, v142); + float32x2_t v182 = vsub_f32(v179, v142); + float32x2_t v184 = vsub_f32(v179, v146); + float32x2_t v186 = vadd_f32(v157, v164); + float32x2_t v188 = vsub_f32(v157, v164); + float32x2_t v190 = vsub_f32(v157, v171); + float32x2_t v264 = vadd_f32(v263, v226); + float32x2_t v266 = vsub_f32(v263, v226); + float32x2_t v268 = vsub_f32(v263, v230); + float32x2_t v270 = vadd_f32(v241, v248); + float32x2_t v272 = vsub_f32(v241, v248); + float32x2_t v274 = vsub_f32(v241, v255); + float32x2_t v181 = vadd_f32(v180, v146); + float32x2_t v183 = vsub_f32(v182, v150); + float32x2_t v185 = vadd_f32(v184, v150); + float32x2_t v187 = vadd_f32(v186, v171); + float32x2_t v189 = vsub_f32(v188, v178); + float32x2_t v191 = vadd_f32(v190, v178); + float32x2_t v265 = vadd_f32(v264, v230); + float32x2_t v267 = vsub_f32(v266, v234); + float32x2_t v269 = vadd_f32(v268, v234); + float32x2_t v271 = vadd_f32(v270, v255); + float32x2_t v273 = vsub_f32(v272, v262); + float32x2_t v275 = vadd_f32(v274, v262); + float32x2_t v192 = vadd_f32(v181, v187); + float32x2_t v193 = vsub_f32(v181, v187); + float32x2_t v194 = vadd_f32(v183, v189); + float32x2_t v195 = vsub_f32(v183, v189); + float32x2_t v196 = vadd_f32(v185, v191); + float32x2_t v197 = vsub_f32(v185, v191); + float32x2_t v276 = vadd_f32(v265, v271); + float32x2_t v277 = vsub_f32(v265, v271); + float32x2_t v278 = vadd_f32(v267, v273); + float32x2_t v279 = vsub_f32(v267, v273); + float32x2_t v280 = vadd_f32(v269, v275); + float32x2_t v281 = 
vsub_f32(v269, v275); + v6[ostride * 8] = v193; + v6[ostride] = v277; + v6[ostride * 2] = v195; + v6[ostride * 9] = v279; + v6[ostride * 10] = v196; + v6[ostride * 3] = v280; + v6[ostride * 4] = v197; + v6[ostride * 11] = v281; + v6[ostride * 12] = v194; + v6[ostride * 5] = v278; + v6[ostride * 6] = v192; + v6[ostride * 13] = v276; + v5 += 1 * idist; + v6 += 1 * 1; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cf32_ac_n_gu14(const armral_cmplx_int16_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + int idist, float dir) { + int64_t v0 = istride; + int64_t v1 = idist; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * v1; + int64_t v12 = v10 * 1; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + float v256 = -1.1666666666666665e+00F; + float v261 = 7.9015646852540022e-01F; + float v266 = 5.5854267289647742e-02F; + float v271 = 7.3430220123575241e-01F; + float v276 = -4.4095855184409838e-01F; + float v283 = -3.4087293062393137e-01F; + float v290 = 5.3396936033772524e-01F; + float v297 = -8.7484229096165667e-01F; + const int32_t *v508 = &v5[v0]; + float32x2_t *v599 = &v6[v2]; + int64_t v27 = v0 * 7; + int64_t v37 = v0 * 2; + int64_t v45 = v0 * 9; + int64_t v55 = v0 * 4; + int64_t v63 = v0 * 11; + int64_t v73 = v0 * 6; + int64_t v81 = v0 * 13; + int64_t v91 = v0 * 8; + int64_t v109 = v0 * 10; + int64_t v117 = v0 * 3; + int64_t v127 = v0 * 12; + int64_t v135 = v0 * 5; + float v279 = v4 * v276; + float v286 = v4 * v283; + float v293 = v4 * v290; + float v300 = v4 * v297; + int64_t v330 = v2 * 7; + int64_t v337 = v2 * 8; + int64_t v351 = v2 * 2; + int64_t v358 = v2 * 9; + int64_t v365 = v2 * 10; + int64_t v372 = v2 * 3; + int64_t v379 = v2 * 4; + int64_t v386 = v2 * 11; + int64_t v393 = v2 * 12; + int64_t v400 = v2 * 5; + int64_t v407 = v2 * 6; + int64_t v414 = v2 * 13; + const int32_t *v427 = &v5[0]; + svint64_t v545 = svindex_s64(0, v1); + svfloat32_t v557 = svdup_n_f32(v256); + svfloat32_t v558 = svdup_n_f32(v261); + svfloat32_t v559 = svdup_n_f32(v266); + svfloat32_t v560 = svdup_n_f32(v271); + float32x2_t *v572 = &v6[0]; + svint16_t v429 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v427), v545)); + const int32_t *v436 = &v5[v27]; + const int32_t *v445 = &v5[v37]; + const int32_t *v454 = &v5[v45]; + const int32_t *v463 = &v5[v55]; + const int32_t *v472 = &v5[v63]; + const int32_t *v481 = &v5[v73]; + const int32_t *v490 = &v5[v81]; + const int32_t *v499 = &v5[v91]; + svint16_t v510 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v508), v545)); + const int32_t *v517 = &v5[v109]; + const int32_t *v526 = &v5[v117]; + const int32_t *v535 = &v5[v127]; + const int32_t *v544 = &v5[v135]; + svfloat32_t v561 = svdup_n_f32(v279); + svfloat32_t v562 = svdup_n_f32(v286); + svfloat32_t v563 = svdup_n_f32(v293); + svfloat32_t v564 = svdup_n_f32(v300); + float32x2_t *v581 = &v6[v330]; + float32x2_t *v590 = &v6[v337]; + float32x2_t *v608 = &v6[v351]; + float32x2_t *v617 = &v6[v358]; + float32x2_t *v626 = &v6[v365]; + float32x2_t *v635 = &v6[v372]; + float32x2_t *v644 = &v6[v379]; + float32x2_t *v653 = &v6[v386]; + float32x2_t *v662 = &v6[v393]; + float32x2_t *v671 = &v6[v400]; + float32x2_t *v680 = &v6[v407]; + float32x2_t *v689 = &v6[v414]; + svfloat32_t v25 = svmul_n_f32_x( + 
pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v429, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v105 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v510, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svint16_t v438 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v436), v545)); + svint16_t v447 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v445), v545)); + svint16_t v456 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v454), v545)); + svint16_t v465 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v463), v545)); + svint16_t v474 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v472), v545)); + svint16_t v483 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v481), v545)); + svint16_t v492 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v490), v545)); + svint16_t v501 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v499), v545)); + svint16_t v519 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v517), v545)); + svint16_t v528 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v526), v545)); + svint16_t v537 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v535), v545)); + svint16_t v546 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v544), v545)); + svfloat32_t v33 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v438, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v43 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v447, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v51 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v456, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v61 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v465, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v69 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v474, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v79 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v483, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v87 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v492, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v97 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, 
svreinterpret_s32_s16(svtbl_s16( + v501, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v115 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v519, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v123 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v528, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v133 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v537, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v141 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v546, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v34 = svadd_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v35 = svsub_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v52 = svadd_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v53 = svsub_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v70 = svadd_f32_x(svptrue_b32(), v61, v69); + svfloat32_t v71 = svsub_f32_x(svptrue_b32(), v61, v69); + svfloat32_t v88 = svadd_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v89 = svsub_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v106 = svadd_f32_x(svptrue_b32(), v97, v105); + svfloat32_t v107 = svsub_f32_x(svptrue_b32(), v97, v105); + svfloat32_t v124 = svadd_f32_x(svptrue_b32(), v115, v123); + svfloat32_t v125 = svsub_f32_x(svptrue_b32(), v115, v123); + svfloat32_t v142 = svadd_f32_x(svptrue_b32(), v133, v141); + svfloat32_t v143 = svsub_f32_x(svptrue_b32(), v133, v141); + svfloat32_t v144 = svadd_f32_x(svptrue_b32(), v52, v142); + svfloat32_t v145 = svsub_f32_x(svptrue_b32(), v52, v142); + svfloat32_t v146 = svadd_f32_x(svptrue_b32(), v106, v88); + svfloat32_t v147 = svsub_f32_x(svptrue_b32(), v106, v88); + svfloat32_t v148 = svadd_f32_x(svptrue_b32(), v70, v124); + svfloat32_t v149 = svsub_f32_x(svptrue_b32(), v70, v124); + svfloat32_t v233 = svadd_f32_x(svptrue_b32(), v53, v143); + svfloat32_t v234 = svsub_f32_x(svptrue_b32(), v53, v143); + svfloat32_t v235 = svadd_f32_x(svptrue_b32(), v107, v89); + svfloat32_t v236 = svsub_f32_x(svptrue_b32(), v107, v89); + svfloat32_t v237 = svadd_f32_x(svptrue_b32(), v71, v125); + svfloat32_t v238 = svsub_f32_x(svptrue_b32(), v71, v125); + svfloat32_t v150 = svadd_f32_x(svptrue_b32(), v144, v146); + svfloat32_t v153 = svsub_f32_x(svptrue_b32(), v144, v146); + svfloat32_t v154 = svsub_f32_x(svptrue_b32(), v146, v148); + svfloat32_t v155 = svsub_f32_x(svptrue_b32(), v148, v144); + svfloat32_t v156 = svadd_f32_x(svptrue_b32(), v145, v147); + svfloat32_t v158 = svsub_f32_x(svptrue_b32(), v145, v147); + svfloat32_t v159 = svsub_f32_x(svptrue_b32(), v147, v149); + svfloat32_t v160 = svsub_f32_x(svptrue_b32(), v149, v145); + svfloat32_t v239 = svadd_f32_x(svptrue_b32(), v233, v235); + svfloat32_t v242 = svsub_f32_x(svptrue_b32(), v233, v235); + svfloat32_t v243 = svsub_f32_x(svptrue_b32(), v235, v237); + svfloat32_t v244 = svsub_f32_x(svptrue_b32(), v237, v233); + svfloat32_t v245 = svadd_f32_x(svptrue_b32(), v234, v236); + svfloat32_t v247 = svsub_f32_x(svptrue_b32(), v234, v236); + svfloat32_t v248 = svsub_f32_x(svptrue_b32(), v236, 
v238); + svfloat32_t v249 = svsub_f32_x(svptrue_b32(), v238, v234); + svfloat32_t v151 = svadd_f32_x(svptrue_b32(), v150, v148); + svfloat32_t v157 = svadd_f32_x(svptrue_b32(), v156, v149); + svfloat32_t zero199 = svdup_n_f32(0); + svfloat32_t v199 = svcmla_f32_x(pred_full, zero199, v562, v158, 90); + svfloat32_t zero206 = svdup_n_f32(0); + svfloat32_t v206 = svcmla_f32_x(pred_full, zero206, v563, v159, 90); + svfloat32_t zero213 = svdup_n_f32(0); + svfloat32_t v213 = svcmla_f32_x(pred_full, zero213, v564, v160, 90); + svfloat32_t v240 = svadd_f32_x(svptrue_b32(), v239, v237); + svfloat32_t v246 = svadd_f32_x(svptrue_b32(), v245, v238); + svfloat32_t zero288 = svdup_n_f32(0); + svfloat32_t v288 = svcmla_f32_x(pred_full, zero288, v562, v247, 90); + svfloat32_t zero295 = svdup_n_f32(0); + svfloat32_t v295 = svcmla_f32_x(pred_full, zero295, v563, v248, 90); + svfloat32_t zero302 = svdup_n_f32(0); + svfloat32_t v302 = svcmla_f32_x(pred_full, zero302, v564, v249, 90); + svfloat32_t v152 = svadd_f32_x(svptrue_b32(), v151, v34); + svfloat32_t zero192 = svdup_n_f32(0); + svfloat32_t v192 = svcmla_f32_x(pred_full, zero192, v561, v157, 90); + svfloat32_t v241 = svadd_f32_x(svptrue_b32(), v240, v35); + svfloat32_t zero281 = svdup_n_f32(0); + svfloat32_t v281 = svcmla_f32_x(pred_full, zero281, v561, v246, 90); + svfloat32_t v214 = svmla_f32_x(pred_full, v152, v151, v557); + svfloat32_t v221 = svadd_f32_x(svptrue_b32(), v192, v199); + svfloat32_t v223 = svsub_f32_x(svptrue_b32(), v192, v199); + svfloat32_t v225 = svsub_f32_x(svptrue_b32(), v192, v206); + svfloat32_t v303 = svmla_f32_x(pred_full, v241, v240, v557); + svfloat32_t v310 = svadd_f32_x(svptrue_b32(), v281, v288); + svfloat32_t v312 = svsub_f32_x(svptrue_b32(), v281, v288); + svfloat32_t v314 = svsub_f32_x(svptrue_b32(), v281, v295); + svst1_f64(pred_full, (double *)(v572), svreinterpret_f64_f32(v152)); + svst1_f64(pred_full, (double *)(v581), svreinterpret_f64_f32(v241)); + svfloat32_t v215 = svmla_f32_x(pred_full, v214, v153, v558); + svfloat32_t v217 = svmls_f32_x(pred_full, v214, v153, v558); + svfloat32_t v219 = svmls_f32_x(pred_full, v214, v154, v559); + svfloat32_t v222 = svadd_f32_x(svptrue_b32(), v221, v206); + svfloat32_t v224 = svsub_f32_x(svptrue_b32(), v223, v213); + svfloat32_t v226 = svadd_f32_x(svptrue_b32(), v225, v213); + svfloat32_t v304 = svmla_f32_x(pred_full, v303, v242, v558); + svfloat32_t v306 = svmls_f32_x(pred_full, v303, v242, v558); + svfloat32_t v308 = svmls_f32_x(pred_full, v303, v243, v559); + svfloat32_t v311 = svadd_f32_x(svptrue_b32(), v310, v295); + svfloat32_t v313 = svsub_f32_x(svptrue_b32(), v312, v302); + svfloat32_t v315 = svadd_f32_x(svptrue_b32(), v314, v302); + svfloat32_t v216 = svmla_f32_x(pred_full, v215, v154, v559); + svfloat32_t v218 = svmls_f32_x(pred_full, v217, v155, v560); + svfloat32_t v220 = svmla_f32_x(pred_full, v219, v155, v560); + svfloat32_t v305 = svmla_f32_x(pred_full, v304, v243, v559); + svfloat32_t v307 = svmls_f32_x(pred_full, v306, v244, v560); + svfloat32_t v309 = svmla_f32_x(pred_full, v308, v244, v560); + svfloat32_t v227 = svadd_f32_x(svptrue_b32(), v216, v222); + svfloat32_t v228 = svsub_f32_x(svptrue_b32(), v216, v222); + svfloat32_t v229 = svadd_f32_x(svptrue_b32(), v218, v224); + svfloat32_t v230 = svsub_f32_x(svptrue_b32(), v218, v224); + svfloat32_t v231 = svadd_f32_x(svptrue_b32(), v220, v226); + svfloat32_t v232 = svsub_f32_x(svptrue_b32(), v220, v226); + svfloat32_t v316 = svadd_f32_x(svptrue_b32(), v305, v311); + svfloat32_t v317 = svsub_f32_x(svptrue_b32(), 
v305, v311); + svfloat32_t v318 = svadd_f32_x(svptrue_b32(), v307, v313); + svfloat32_t v319 = svsub_f32_x(svptrue_b32(), v307, v313); + svfloat32_t v320 = svadd_f32_x(svptrue_b32(), v309, v315); + svfloat32_t v321 = svsub_f32_x(svptrue_b32(), v309, v315); + svst1_f64(pred_full, (double *)(v590), svreinterpret_f64_f32(v228)); + svst1_f64(pred_full, (double *)(v599), svreinterpret_f64_f32(v317)); + svst1_f64(pred_full, (double *)(v608), svreinterpret_f64_f32(v230)); + svst1_f64(pred_full, (double *)(v617), svreinterpret_f64_f32(v319)); + svst1_f64(pred_full, (double *)(v626), svreinterpret_f64_f32(v231)); + svst1_f64(pred_full, (double *)(v635), svreinterpret_f64_f32(v320)); + svst1_f64(pred_full, (double *)(v644), svreinterpret_f64_f32(v232)); + svst1_f64(pred_full, (double *)(v653), svreinterpret_f64_f32(v321)); + svst1_f64(pred_full, (double *)(v662), svreinterpret_f64_f32(v229)); + svst1_f64(pred_full, (double *)(v671), svreinterpret_f64_f32(v318)); + svst1_f64(pred_full, (double *)(v680), svreinterpret_f64_f32(v227)); + svst1_f64(pred_full, (double *)(v689), svreinterpret_f64_f32(v316)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cf32_ac_n_gu15(const armral_cmplx_int16_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + int idist, float dir) { + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + for (int j = 0; j < howmany; j += 1) { + int16x4_t v68 = vld1s_s16(&v5[istride]); + float v134 = -1.2500000000000000e+00F; + float v138 = 5.5901699437494745e-01F; + float v141 = 1.5388417685876268e+00F; + float v142 = -1.5388417685876268e+00F; + float v148 = 5.8778525229247325e-01F; + float v149 = -5.8778525229247325e-01F; + float v155 = 3.6327126400268028e-01F; + float v156 = -3.6327126400268028e-01F; + float v180 = -1.4999999999999998e+00F; + float v184 = 1.8749999999999998e+00F; + float v188 = -8.3852549156242107e-01F; + float v191 = -2.3082626528814396e+00F; + float v192 = 2.3082626528814396e+00F; + float v198 = -8.8167787843870971e-01F; + float v199 = 8.8167787843870971e-01F; + float v205 = -5.4490689600402031e-01F; + float v206 = 5.4490689600402031e-01F; + float v229 = 8.6602540378443871e-01F; + float v230 = -8.6602540378443871e-01F; + float v236 = -1.0825317547305484e+00F; + float v237 = 1.0825317547305484e+00F; + float v243 = 4.8412291827592718e-01F; + float v244 = -4.8412291827592718e-01F; + float32x2_t v246 = (float32x2_t){v4, v4}; + float v251 = -1.3326760640014592e+00F; + float v255 = -5.0903696045512736e-01F; + float v259 = -3.1460214309120460e-01F; + int16x4_t v34 = vld1s_s16(&v5[0]); + float32x2_t v69 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v68)), 15); + float32x2_t v135 = (float32x2_t){v134, v134}; + float32x2_t v139 = (float32x2_t){v138, v138}; + float32x2_t v143 = (float32x2_t){v141, v142}; + float32x2_t v150 = (float32x2_t){v148, v149}; + float32x2_t v157 = (float32x2_t){v155, v156}; + float32x2_t v181 = (float32x2_t){v180, v180}; + float32x2_t v185 = (float32x2_t){v184, v184}; + float32x2_t v189 = (float32x2_t){v188, v188}; + float32x2_t v193 = (float32x2_t){v191, v192}; + float32x2_t v200 = (float32x2_t){v198, v199}; + float32x2_t v207 = (float32x2_t){v205, v206}; + float32x2_t v231 = (float32x2_t){v229, v230}; + float32x2_t v238 = (float32x2_t){v236, v237}; + float32x2_t v245 = (float32x2_t){v243, v244}; + float32x2_t v252 = (float32x2_t){v251, v251}; + float32x2_t v256 = (float32x2_t){v255, v255}; + float32x2_t v260 = 
(float32x2_t){v259, v259}; + int16x4_t v20 = vld1s_s16(&v5[istride * 5]); + int16x4_t v26 = vld1s_s16(&v5[istride * 10]); + float32x2_t v35 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v34)), 15); + int16x4_t v41 = vld1s_s16(&v5[istride * 8]); + int16x4_t v47 = vld1s_s16(&v5[istride * 13]); + int16x4_t v55 = vld1s_s16(&v5[istride * 3]); + int16x4_t v62 = vld1s_s16(&v5[istride * 11]); + int16x4_t v76 = vld1s_s16(&v5[istride * 6]); + int16x4_t v83 = vld1s_s16(&v5[istride * 14]); + int16x4_t v89 = vld1s_s16(&v5[istride * 4]); + int16x4_t v97 = vld1s_s16(&v5[istride * 9]); + int16x4_t v104 = vld1s_s16(&v5[istride * 2]); + int16x4_t v110 = vld1s_s16(&v5[istride * 7]); + int16x4_t v118 = vld1s_s16(&v5[istride * 12]); + float32x2_t v145 = vmul_f32(v246, v143); + float32x2_t v152 = vmul_f32(v246, v150); + float32x2_t v159 = vmul_f32(v246, v157); + float32x2_t v195 = vmul_f32(v246, v193); + float32x2_t v202 = vmul_f32(v246, v200); + float32x2_t v209 = vmul_f32(v246, v207); + float32x2_t v233 = vmul_f32(v246, v231); + float32x2_t v240 = vmul_f32(v246, v238); + float32x2_t v247 = vmul_f32(v246, v245); + float32x2_t v21 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v20)), 15); + float32x2_t v27 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v26)), 15); + float32x2_t v42 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v41)), 15); + float32x2_t v48 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v47)), 15); + float32x2_t v56 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v55)), 15); + float32x2_t v63 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v62)), 15); + float32x2_t v77 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v76)), 15); + float32x2_t v84 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v83)), 15); + float32x2_t v90 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v89)), 15); + float32x2_t v98 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v97)), 15); + float32x2_t v105 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v104)), 15); + float32x2_t v111 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v110)), 15); + float32x2_t v119 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v118)), 15); + float32x2_t v28 = vadd_f32(v21, v27); + float32x2_t v29 = vsub_f32(v21, v27); + float32x2_t v49 = vadd_f32(v42, v48); + float32x2_t v50 = vsub_f32(v42, v48); + float32x2_t v70 = vadd_f32(v63, v69); + float32x2_t v71 = vsub_f32(v63, v69); + float32x2_t v91 = vadd_f32(v84, v90); + float32x2_t v92 = vsub_f32(v84, v90); + float32x2_t v112 = vadd_f32(v105, v111); + float32x2_t v113 = vsub_f32(v105, v111); + float32x2_t v36 = vadd_f32(v28, v35); + float32x2_t v57 = vadd_f32(v49, v56); + float32x2_t v78 = vadd_f32(v70, v77); + float32x2_t v99 = vadd_f32(v91, v98); + float32x2_t v120 = vadd_f32(v112, v119); + float32x2_t v171 = vadd_f32(v49, v112); + float32x2_t v172 = vsub_f32(v49, v112); + float32x2_t v173 = vadd_f32(v91, v70); + float32x2_t v174 = vsub_f32(v91, v70); + float32x2_t v221 = vadd_f32(v50, v113); + float32x2_t v222 = vsub_f32(v50, v113); + float32x2_t v223 = vadd_f32(v92, v71); + float32x2_t v224 = vsub_f32(v92, v71); + float32x2_t v121 = vadd_f32(v57, v120); + float32x2_t v122 = vsub_f32(v57, v120); + float32x2_t v123 = vadd_f32(v99, v78); + float32x2_t v124 = vsub_f32(v99, v78); + float32x2_t v175 = vadd_f32(v171, v173); + float32x2_t v176 = vsub_f32(v171, v173); + float32x2_t v177 = vadd_f32(v172, v174); + float32x2_t v196 = vrev64_f32(v172); + float32x2_t v210 = vrev64_f32(v174); + float32x2_t v225 = vadd_f32(v221, v223); + float32x2_t v226 = vsub_f32(v221, v223); + float32x2_t v227 = vadd_f32(v222, v224); + float32x2_t v253 = vmul_f32(v222, v252); + float32x2_t v261 = vmul_f32(v224, v260); + 
float32x2_t v125 = vadd_f32(v121, v123); + float32x2_t v126 = vsub_f32(v121, v123); + float32x2_t v127 = vadd_f32(v122, v124); + float32x2_t v146 = vrev64_f32(v122); + float32x2_t v160 = vrev64_f32(v124); + float32x2_t v178 = vadd_f32(v175, v28); + float32x2_t v186 = vmul_f32(v175, v185); + float32x2_t v190 = vmul_f32(v176, v189); + float32x2_t v197 = vmul_f32(v196, v195); + float32x2_t v203 = vrev64_f32(v177); + float32x2_t v211 = vmul_f32(v210, v209); + float32x2_t v228 = vadd_f32(v225, v29); + float32x2_t v241 = vrev64_f32(v225); + float32x2_t v248 = vrev64_f32(v226); + float32x2_t v257 = vmul_f32(v227, v256); + float32x2_t v128 = vadd_f32(v125, v36); + float32x2_t v136 = vmul_f32(v125, v135); + float32x2_t v140 = vmul_f32(v126, v139); + float32x2_t v147 = vmul_f32(v146, v145); + float32x2_t v153 = vrev64_f32(v127); + float32x2_t v161 = vmul_f32(v160, v159); + float32x2_t v182 = vmul_f32(v178, v181); + float32x2_t v204 = vmul_f32(v203, v202); + float32x2_t v234 = vrev64_f32(v228); + float32x2_t v242 = vmul_f32(v241, v240); + float32x2_t v249 = vmul_f32(v248, v247); + float32x2_t v265 = vsub_f32(v253, v257); + float32x2_t v266 = vadd_f32(v257, v261); + float32x2_t v154 = vmul_f32(v153, v152); + float32x2_t v162 = vadd_f32(v128, v136); + float32x2_t v212 = vadd_f32(v182, v186); + float32x2_t v215 = vsub_f32(v197, v204); + float32x2_t v216 = vadd_f32(v204, v211); + float32x2_t v235 = vmul_f32(v234, v233); + float32x2_t v271 = vadd_f32(v128, v182); + v6[0] = v128; + float32x2_t v163 = vadd_f32(v162, v140); + float32x2_t v164 = vsub_f32(v162, v140); + float32x2_t v165 = vsub_f32(v147, v154); + float32x2_t v166 = vadd_f32(v154, v161); + float32x2_t v213 = vadd_f32(v212, v190); + float32x2_t v214 = vsub_f32(v212, v190); + float32x2_t v262 = vadd_f32(v235, v242); + float32x2_t v272 = vadd_f32(v271, v235); + float32x2_t v273 = vsub_f32(v271, v235); + float32x2_t v167 = vadd_f32(v163, v165); + float32x2_t v168 = vsub_f32(v163, v165); + float32x2_t v169 = vadd_f32(v164, v166); + float32x2_t v170 = vsub_f32(v164, v166); + float32x2_t v217 = vadd_f32(v213, v215); + float32x2_t v218 = vsub_f32(v213, v215); + float32x2_t v219 = vadd_f32(v214, v216); + float32x2_t v220 = vsub_f32(v214, v216); + float32x2_t v263 = vadd_f32(v262, v249); + float32x2_t v264 = vsub_f32(v262, v249); + v6[ostride * 10] = v273; + v6[ostride * 5] = v272; + float32x2_t v267 = vadd_f32(v263, v265); + float32x2_t v268 = vsub_f32(v263, v265); + float32x2_t v269 = vadd_f32(v264, v266); + float32x2_t v270 = vsub_f32(v264, v266); + float32x2_t v289 = vadd_f32(v168, v218); + v6[ostride * 6] = v168; + float32x2_t v307 = vadd_f32(v170, v220); + v6[ostride * 12] = v170; + float32x2_t v325 = vadd_f32(v169, v219); + v6[ostride * 3] = v169; + float32x2_t v343 = vadd_f32(v167, v217); + v6[ostride * 9] = v167; + float32x2_t v290 = vadd_f32(v289, v268); + float32x2_t v291 = vsub_f32(v289, v268); + float32x2_t v308 = vadd_f32(v307, v270); + float32x2_t v309 = vsub_f32(v307, v270); + float32x2_t v326 = vadd_f32(v325, v269); + float32x2_t v327 = vsub_f32(v325, v269); + float32x2_t v344 = vadd_f32(v343, v267); + float32x2_t v345 = vsub_f32(v343, v267); + v6[ostride] = v291; + v6[ostride * 11] = v290; + v6[ostride * 7] = v309; + v6[ostride * 2] = v308; + v6[ostride * 13] = v327; + v6[ostride * 8] = v326; + v6[ostride * 4] = v345; + v6[ostride * 14] = v344; + v5 += 1 * idist; + v6 += 1 * 1; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cf32_ac_n_gu15(const armral_cmplx_int16_t *restrict x, + armral_cmplx_f32_t *restrict y, + int 
istride, int ostride, int howmany, + int idist, float dir) { + int64_t v0 = istride; + int64_t v1 = idist; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * v1; + int64_t v12 = v10 * 1; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + float v167 = -1.2500000000000000e+00F; + float v172 = 5.5901699437494745e-01F; + float v177 = -1.5388417685876268e+00F; + float v184 = -5.8778525229247325e-01F; + float v191 = -3.6327126400268028e-01F; + float v215 = -1.4999999999999998e+00F; + float v220 = 1.8749999999999998e+00F; + float v225 = -8.3852549156242107e-01F; + float v230 = 2.3082626528814396e+00F; + float v237 = 8.8167787843870971e-01F; + float v244 = 5.4490689600402031e-01F; + float v268 = -8.6602540378443871e-01F; + float v275 = 1.0825317547305484e+00F; + float v282 = -4.8412291827592718e-01F; + float v289 = -1.3326760640014592e+00F; + float v294 = -5.0903696045512736e-01F; + float v299 = -3.1460214309120460e-01F; + const int32_t *v502 = &v5[v0]; + float32x2_t *v629 = &v6[v2]; + int64_t v19 = v0 * 5; + int64_t v27 = v0 * 10; + int64_t v46 = v0 * 8; + int64_t v54 = v0 * 13; + int64_t v64 = v0 * 3; + int64_t v73 = v0 * 11; + int64_t v91 = v0 * 6; + int64_t v100 = v0 * 14; + int64_t v108 = v0 * 4; + int64_t v118 = v0 * 9; + int64_t v127 = v0 * 2; + int64_t v135 = v0 * 7; + int64_t v145 = v0 * 12; + float v180 = v4 * v177; + float v187 = v4 * v184; + float v194 = v4 * v191; + float v233 = v4 * v230; + float v240 = v4 * v237; + float v247 = v4 * v244; + float v271 = v4 * v268; + float v278 = v4 * v275; + float v285 = v4 * v282; + int64_t v323 = v2 * 10; + int64_t v330 = v2 * 5; + int64_t v340 = v2 * 6; + int64_t v354 = v2 * 11; + int64_t v364 = v2 * 12; + int64_t v371 = v2 * 7; + int64_t v378 = v2 * 2; + int64_t v388 = v2 * 3; + int64_t v395 = v2 * 13; + int64_t v402 = v2 * 8; + int64_t v412 = v2 * 9; + int64_t v419 = v2 * 4; + int64_t v426 = v2 * 14; + const int32_t *v457 = &v5[0]; + svint64_t v566 = svindex_s64(0, v1); + svfloat32_t v569 = svdup_n_f32(v167); + svfloat32_t v570 = svdup_n_f32(v172); + svfloat32_t v574 = svdup_n_f32(v215); + svfloat32_t v575 = svdup_n_f32(v220); + svfloat32_t v576 = svdup_n_f32(v225); + svfloat32_t v583 = svdup_n_f32(v289); + svfloat32_t v584 = svdup_n_f32(v294); + svfloat32_t v585 = svdup_n_f32(v299); + float32x2_t *v593 = &v6[0]; + const int32_t *v438 = &v5[v19]; + const int32_t *v447 = &v5[v27]; + svint16_t v459 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v457), v566)); + const int32_t *v466 = &v5[v46]; + const int32_t *v475 = &v5[v54]; + const int32_t *v484 = &v5[v64]; + const int32_t *v493 = &v5[v73]; + svint16_t v504 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v502), v566)); + const int32_t *v511 = &v5[v91]; + const int32_t *v520 = &v5[v100]; + const int32_t *v529 = &v5[v108]; + const int32_t *v538 = &v5[v118]; + const int32_t *v547 = &v5[v127]; + const int32_t *v556 = &v5[v135]; + const int32_t *v565 = &v5[v145]; + svfloat32_t v571 = svdup_n_f32(v180); + svfloat32_t v572 = svdup_n_f32(v187); + svfloat32_t v573 = svdup_n_f32(v194); + svfloat32_t v577 = svdup_n_f32(v233); + svfloat32_t v578 = svdup_n_f32(v240); + svfloat32_t v579 = svdup_n_f32(v247); + svfloat32_t v580 = svdup_n_f32(v271); + svfloat32_t v581 = svdup_n_f32(v278); + svfloat32_t v582 = svdup_n_f32(v285); + float32x2_t *v602 = 
&v6[v323]; + float32x2_t *v611 = &v6[v330]; + float32x2_t *v620 = &v6[v340]; + float32x2_t *v638 = &v6[v354]; + float32x2_t *v647 = &v6[v364]; + float32x2_t *v656 = &v6[v371]; + float32x2_t *v665 = &v6[v378]; + float32x2_t *v674 = &v6[v388]; + float32x2_t *v683 = &v6[v395]; + float32x2_t *v692 = &v6[v402]; + float32x2_t *v701 = &v6[v412]; + float32x2_t *v710 = &v6[v419]; + float32x2_t *v719 = &v6[v426]; + svfloat32_t v43 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v459, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v87 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v504, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svint16_t v440 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v438), v566)); + svint16_t v449 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v447), v566)); + svint16_t v468 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v466), v566)); + svint16_t v477 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v475), v566)); + svint16_t v486 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v484), v566)); + svint16_t v495 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v493), v566)); + svint16_t v513 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v511), v566)); + svint16_t v522 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v520), v566)); + svint16_t v531 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v529), v566)); + svint16_t v540 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v538), v566)); + svint16_t v549 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v547), v566)); + svint16_t v558 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v556), v566)); + svint16_t v567 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v565), v566)); + svfloat32_t v25 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v440, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v33 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v449, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v52 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v468, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v60 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v477, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v70 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v486, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL 
<< 31ULL)); + svfloat32_t v79 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v495, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v97 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v513, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v106 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v522, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v114 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v531, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v124 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v540, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v133 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v549, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v141 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v558, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v151 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v567, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v34 = svadd_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v35 = svsub_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v61 = svadd_f32_x(svptrue_b32(), v52, v60); + svfloat32_t v62 = svsub_f32_x(svptrue_b32(), v52, v60); + svfloat32_t v88 = svadd_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v89 = svsub_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v115 = svadd_f32_x(svptrue_b32(), v106, v114); + svfloat32_t v116 = svsub_f32_x(svptrue_b32(), v106, v114); + svfloat32_t v142 = svadd_f32_x(svptrue_b32(), v133, v141); + svfloat32_t v143 = svsub_f32_x(svptrue_b32(), v133, v141); + svfloat32_t v44 = svadd_f32_x(svptrue_b32(), v34, v43); + svfloat32_t v71 = svadd_f32_x(svptrue_b32(), v61, v70); + svfloat32_t v98 = svadd_f32_x(svptrue_b32(), v88, v97); + svfloat32_t v125 = svadd_f32_x(svptrue_b32(), v115, v124); + svfloat32_t v152 = svadd_f32_x(svptrue_b32(), v142, v151); + svfloat32_t v206 = svadd_f32_x(svptrue_b32(), v61, v142); + svfloat32_t v207 = svsub_f32_x(svptrue_b32(), v61, v142); + svfloat32_t v208 = svadd_f32_x(svptrue_b32(), v115, v88); + svfloat32_t v209 = svsub_f32_x(svptrue_b32(), v115, v88); + svfloat32_t v259 = svadd_f32_x(svptrue_b32(), v62, v143); + svfloat32_t v260 = svsub_f32_x(svptrue_b32(), v62, v143); + svfloat32_t v261 = svadd_f32_x(svptrue_b32(), v116, v89); + svfloat32_t v262 = svsub_f32_x(svptrue_b32(), v116, v89); + svfloat32_t v153 = svadd_f32_x(svptrue_b32(), v71, v152); + svfloat32_t v154 = svsub_f32_x(svptrue_b32(), v71, v152); + svfloat32_t v155 = svadd_f32_x(svptrue_b32(), v125, v98); + svfloat32_t v156 = svsub_f32_x(svptrue_b32(), v125, v98); + svfloat32_t v210 = svadd_f32_x(svptrue_b32(), v206, 
v208); + svfloat32_t v211 = svsub_f32_x(svptrue_b32(), v206, v208); + svfloat32_t v212 = svadd_f32_x(svptrue_b32(), v207, v209); + svfloat32_t zero235 = svdup_n_f32(0); + svfloat32_t v235 = svcmla_f32_x(pred_full, zero235, v577, v207, 90); + svfloat32_t v263 = svadd_f32_x(svptrue_b32(), v259, v261); + svfloat32_t v264 = svsub_f32_x(svptrue_b32(), v259, v261); + svfloat32_t v265 = svadd_f32_x(svptrue_b32(), v260, v262); + svfloat32_t v302 = svmul_f32_x(svptrue_b32(), v262, v585); + svfloat32_t v157 = svadd_f32_x(svptrue_b32(), v153, v155); + svfloat32_t v158 = svsub_f32_x(svptrue_b32(), v153, v155); + svfloat32_t v159 = svadd_f32_x(svptrue_b32(), v154, v156); + svfloat32_t zero182 = svdup_n_f32(0); + svfloat32_t v182 = svcmla_f32_x(pred_full, zero182, v571, v154, 90); + svfloat32_t v213 = svadd_f32_x(svptrue_b32(), v210, v34); + svfloat32_t v223 = svmul_f32_x(svptrue_b32(), v210, v575); + svfloat32_t zero242 = svdup_n_f32(0); + svfloat32_t v242 = svcmla_f32_x(pred_full, zero242, v578, v212, 90); + svfloat32_t v266 = svadd_f32_x(svptrue_b32(), v263, v35); + svfloat32_t zero287 = svdup_n_f32(0); + svfloat32_t v287 = svcmla_f32_x(pred_full, zero287, v582, v264, 90); + svfloat32_t v297 = svmul_f32_x(svptrue_b32(), v265, v584); + svfloat32_t v160 = svadd_f32_x(svptrue_b32(), v157, v44); + svfloat32_t zero189 = svdup_n_f32(0); + svfloat32_t v189 = svcmla_f32_x(pred_full, zero189, v572, v159, 90); + svfloat32_t v253 = svsub_f32_x(svptrue_b32(), v235, v242); + svfloat32_t v254 = svcmla_f32_x(pred_full, v242, v579, v209, 90); + svfloat32_t zero273 = svdup_n_f32(0); + svfloat32_t v273 = svcmla_f32_x(pred_full, zero273, v580, v266, 90); + svfloat32_t v306 = svnmls_f32_x(pred_full, v297, v260, v583); + svfloat32_t v307 = svmla_f32_x(pred_full, v302, v265, v584); + svfloat32_t v197 = svmla_f32_x(pred_full, v160, v157, v569); + svfloat32_t v200 = svsub_f32_x(svptrue_b32(), v182, v189); + svfloat32_t v201 = svcmla_f32_x(pred_full, v189, v573, v156, 90); + svfloat32_t v250 = svmla_f32_x(pred_full, v223, v213, v574); + svfloat32_t v303 = svcmla_f32_x(pred_full, v273, v581, v263, 90); + svfloat32_t v312 = svmla_f32_x(pred_full, v160, v213, v574); + svst1_f64(pred_full, (double *)(v593), svreinterpret_f64_f32(v160)); + svfloat32_t v198 = svmla_f32_x(pred_full, v197, v158, v570); + svfloat32_t v199 = svmls_f32_x(pred_full, v197, v158, v570); + svfloat32_t v251 = svmla_f32_x(pred_full, v250, v211, v576); + svfloat32_t v252 = svmls_f32_x(pred_full, v250, v211, v576); + svfloat32_t v304 = svadd_f32_x(svptrue_b32(), v303, v287); + svfloat32_t v305 = svsub_f32_x(svptrue_b32(), v303, v287); + svfloat32_t v313 = svadd_f32_x(svptrue_b32(), v312, v273); + svfloat32_t v314 = svsub_f32_x(svptrue_b32(), v312, v273); + svfloat32_t v202 = svadd_f32_x(svptrue_b32(), v198, v200); + svfloat32_t v203 = svsub_f32_x(svptrue_b32(), v198, v200); + svfloat32_t v204 = svadd_f32_x(svptrue_b32(), v199, v201); + svfloat32_t v205 = svsub_f32_x(svptrue_b32(), v199, v201); + svfloat32_t v255 = svadd_f32_x(svptrue_b32(), v251, v253); + svfloat32_t v256 = svsub_f32_x(svptrue_b32(), v251, v253); + svfloat32_t v257 = svadd_f32_x(svptrue_b32(), v252, v254); + svfloat32_t v258 = svsub_f32_x(svptrue_b32(), v252, v254); + svfloat32_t v308 = svadd_f32_x(svptrue_b32(), v304, v306); + svfloat32_t v309 = svsub_f32_x(svptrue_b32(), v304, v306); + svfloat32_t v310 = svadd_f32_x(svptrue_b32(), v305, v307); + svfloat32_t v311 = svsub_f32_x(svptrue_b32(), v305, v307); + svst1_f64(pred_full, (double *)(v602), svreinterpret_f64_f32(v314)); + 
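/* Remaining length-15 outputs are combined below and stored at their ostride-spaced positions. */ +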
svst1_f64(pred_full, (double *)(v611), svreinterpret_f64_f32(v313)); + svfloat32_t v336 = svadd_f32_x(svptrue_b32(), v203, v256); + svfloat32_t v360 = svadd_f32_x(svptrue_b32(), v205, v258); + svfloat32_t v384 = svadd_f32_x(svptrue_b32(), v204, v257); + svfloat32_t v408 = svadd_f32_x(svptrue_b32(), v202, v255); + svst1_f64(pred_full, (double *)(v620), svreinterpret_f64_f32(v203)); + svst1_f64(pred_full, (double *)(v647), svreinterpret_f64_f32(v205)); + svst1_f64(pred_full, (double *)(v674), svreinterpret_f64_f32(v204)); + svst1_f64(pred_full, (double *)(v701), svreinterpret_f64_f32(v202)); + svfloat32_t v337 = svadd_f32_x(svptrue_b32(), v336, v309); + svfloat32_t v338 = svsub_f32_x(svptrue_b32(), v336, v309); + svfloat32_t v361 = svadd_f32_x(svptrue_b32(), v360, v311); + svfloat32_t v362 = svsub_f32_x(svptrue_b32(), v360, v311); + svfloat32_t v385 = svadd_f32_x(svptrue_b32(), v384, v310); + svfloat32_t v386 = svsub_f32_x(svptrue_b32(), v384, v310); + svfloat32_t v409 = svadd_f32_x(svptrue_b32(), v408, v308); + svfloat32_t v410 = svsub_f32_x(svptrue_b32(), v408, v308); + svst1_f64(pred_full, (double *)(v629), svreinterpret_f64_f32(v338)); + svst1_f64(pred_full, (double *)(v638), svreinterpret_f64_f32(v337)); + svst1_f64(pred_full, (double *)(v656), svreinterpret_f64_f32(v362)); + svst1_f64(pred_full, (double *)(v665), svreinterpret_f64_f32(v361)); + svst1_f64(pred_full, (double *)(v683), svreinterpret_f64_f32(v386)); + svst1_f64(pred_full, (double *)(v692), svreinterpret_f64_f32(v385)); + svst1_f64(pred_full, (double *)(v710), svreinterpret_f64_f32(v410)); + svst1_f64(pred_full, (double *)(v719), svreinterpret_f64_f32(v409)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cf32_ac_n_gu16(const armral_cmplx_int16_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + int idist, float dir) { + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + for (int j = 0; j < howmany; j += 1) { + int16x4_t v76 = vld1s_s16(&v5[istride]); + float v197 = 1.0000000000000000e+00F; + float v198 = -1.0000000000000000e+00F; + float v205 = -7.0710678118654746e-01F; + float v212 = 7.0710678118654757e-01F; + float v215 = 9.2387953251128674e-01F; + float v216 = -9.2387953251128674e-01F; + float v223 = 5.4119610014619690e-01F; + float v230 = -1.3065629648763766e+00F; + float32x2_t v232 = (float32x2_t){v4, v4}; + float v237 = 3.8268343236508984e-01F; + float v241 = 1.3065629648763766e+00F; + float v245 = -5.4119610014619690e-01F; + int16x4_t v20 = vld1s_s16(&v5[0]); + float32x2_t v77 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v76)), 15); + float32x2_t v199 = (float32x2_t){v197, v198}; + float32x2_t v206 = (float32x2_t){v212, v205}; + float32x2_t v213 = (float32x2_t){v212, v212}; + float32x2_t v217 = (float32x2_t){v215, v216}; + float32x2_t v224 = (float32x2_t){v245, v223}; + float32x2_t v231 = (float32x2_t){v241, v230}; + float32x2_t v238 = (float32x2_t){v237, v237}; + float32x2_t v242 = (float32x2_t){v241, v241}; + float32x2_t v246 = (float32x2_t){v245, v245}; + float32x2_t v21 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v20)), 15); + int16x4_t v26 = vld1s_s16(&v5[istride * 8]); + int16x4_t v34 = vld1s_s16(&v5[istride * 4]); + int16x4_t v40 = vld1s_s16(&v5[istride * 12]); + int16x4_t v48 = vld1s_s16(&v5[istride * 2]); + int16x4_t v54 = vld1s_s16(&v5[istride * 10]); + int16x4_t v62 = vld1s_s16(&v5[istride * 6]); + int16x4_t v68 = vld1s_s16(&v5[istride * 14]); + int16x4_t v82 = 
vld1s_s16(&v5[istride * 9]); + int16x4_t v90 = vld1s_s16(&v5[istride * 5]); + int16x4_t v96 = vld1s_s16(&v5[istride * 13]); + int16x4_t v104 = vld1s_s16(&v5[istride * 3]); + int16x4_t v110 = vld1s_s16(&v5[istride * 11]); + int16x4_t v118 = vld1s_s16(&v5[istride * 7]); + int16x4_t v124 = vld1s_s16(&v5[istride * 15]); + float32x2_t v201 = vmul_f32(v232, v199); + float32x2_t v208 = vmul_f32(v232, v206); + float32x2_t v219 = vmul_f32(v232, v217); + float32x2_t v226 = vmul_f32(v232, v224); + float32x2_t v233 = vmul_f32(v232, v231); + float32x2_t v27 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v26)), 15); + float32x2_t v35 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v34)), 15); + float32x2_t v41 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v40)), 15); + float32x2_t v49 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v48)), 15); + float32x2_t v55 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v54)), 15); + float32x2_t v63 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v62)), 15); + float32x2_t v69 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v68)), 15); + float32x2_t v83 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v82)), 15); + float32x2_t v91 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v90)), 15); + float32x2_t v97 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v96)), 15); + float32x2_t v105 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v104)), 15); + float32x2_t v111 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v110)), 15); + float32x2_t v119 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v118)), 15); + float32x2_t v125 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v124)), 15); + float32x2_t v28 = vadd_f32(v21, v27); + float32x2_t v29 = vsub_f32(v21, v27); + float32x2_t v42 = vadd_f32(v35, v41); + float32x2_t v43 = vsub_f32(v35, v41); + float32x2_t v56 = vadd_f32(v49, v55); + float32x2_t v57 = vsub_f32(v49, v55); + float32x2_t v70 = vadd_f32(v63, v69); + float32x2_t v71 = vsub_f32(v63, v69); + float32x2_t v84 = vadd_f32(v77, v83); + float32x2_t v85 = vsub_f32(v77, v83); + float32x2_t v98 = vadd_f32(v91, v97); + float32x2_t v99 = vsub_f32(v91, v97); + float32x2_t v112 = vadd_f32(v105, v111); + float32x2_t v113 = vsub_f32(v105, v111); + float32x2_t v126 = vadd_f32(v119, v125); + float32x2_t v127 = vsub_f32(v119, v125); + float32x2_t v128 = vadd_f32(v28, v42); + float32x2_t v129 = vsub_f32(v28, v42); + float32x2_t v130 = vadd_f32(v56, v70); + float32x2_t v131 = vsub_f32(v56, v70); + float32x2_t v132 = vadd_f32(v84, v98); + float32x2_t v133 = vsub_f32(v84, v98); + float32x2_t v134 = vadd_f32(v112, v126); + float32x2_t v135 = vsub_f32(v112, v126); + float32x2_t v144 = vadd_f32(v57, v71); + float32x2_t v145 = vsub_f32(v57, v71); + float32x2_t v146 = vadd_f32(v85, v127); + float32x2_t v147 = vsub_f32(v85, v127); + float32x2_t v148 = vadd_f32(v99, v113); + float32x2_t v149 = vsub_f32(v99, v113); + float32x2_t v202 = vrev64_f32(v43); + float32x2_t v136 = vadd_f32(v128, v130); + float32x2_t v137 = vsub_f32(v128, v130); + float32x2_t v138 = vadd_f32(v132, v134); + float32x2_t v139 = vsub_f32(v132, v134); + float32x2_t v142 = vadd_f32(v133, v135); + float32x2_t v143 = vsub_f32(v133, v135); + float32x2_t v150 = vadd_f32(v146, v148); + float32x2_t v151 = vadd_f32(v147, v149); + float32x2_t v180 = vrev64_f32(v131); + float32x2_t v203 = vmul_f32(v202, v201); + float32x2_t v209 = vrev64_f32(v144); + float32x2_t v214 = vmul_f32(v145, v213); + float32x2_t v227 = vrev64_f32(v146); + float32x2_t v234 = vrev64_f32(v148); + float32x2_t v243 = vmul_f32(v147, v242); + float32x2_t v247 = vmul_f32(v149, v246); + float32x2_t v140 = vadd_f32(v136, v138); + float32x2_t v141 = vsub_f32(v136, v138); + 
float32x2_t v141 = vsub_f32(v136, v138); +
float32x2_t v169 = vrev64_f32(v139); + float32x2_t v181 = vmul_f32(v180, v201); + float32x2_t v187 = vrev64_f32(v142); + float32x2_t v192 = vmul_f32(v143, v213); + float32x2_t v210 = vmul_f32(v209, v208); + float32x2_t v220 = vrev64_f32(v150); + float32x2_t v228 = vmul_f32(v227, v226); + float32x2_t v235 = vmul_f32(v234, v233); + float32x2_t v239 = vmul_f32(v151, v238); + float32x2_t v258 = vadd_f32(v29, v214); + float32x2_t v259 = vsub_f32(v29, v214); + float32x2_t v170 = vmul_f32(v169, v201); + float32x2_t v188 = vmul_f32(v187, v208); + float32x2_t v221 = vmul_f32(v220, v219); + float32x2_t v250 = vadd_f32(v129, v192); + float32x2_t v252 = vsub_f32(v129, v192); + float32x2_t v260 = vadd_f32(v203, v210); + float32x2_t v261 = vsub_f32(v203, v210); + float32x2_t v264 = vsub_f32(v243, v239); + float32x2_t v265 = vsub_f32(v247, v239); + float32x2_t v266 = vsub_f32(v239, v243); + float32x2_t v267 = vsub_f32(v239, v247); + v6[0] = v140; + v6[ostride * 8] = v141; + float32x2_t v248 = vadd_f32(v137, v170); + float32x2_t v249 = vsub_f32(v137, v170); + float32x2_t v251 = vadd_f32(v181, v188); + float32x2_t v253 = vsub_f32(v188, v181); + float32x2_t v262 = vadd_f32(v221, v228); + float32x2_t v263 = vsub_f32(v221, v235); + float32x2_t v268 = vadd_f32(v258, v264); + float32x2_t v269 = vsub_f32(v258, v264); + float32x2_t v270 = vadd_f32(v258, v266); + float32x2_t v271 = vsub_f32(v258, v266); + float32x2_t v272 = vadd_f32(v259, v261); + float32x2_t v273 = vsub_f32(v259, v261); + float32x2_t v274 = vadd_f32(v259, v267); + float32x2_t v275 = vsub_f32(v259, v267); + float32x2_t v254 = vadd_f32(v250, v251); + float32x2_t v255 = vadd_f32(v252, v253); + float32x2_t v256 = vsub_f32(v252, v253); + float32x2_t v257 = vsub_f32(v250, v251); + float32x2_t v278 = vadd_f32(v262, v260); + float32x2_t v279 = vsub_f32(v262, v260); + float32x2_t v280 = vadd_f32(v263, v265); + float32x2_t v281 = vsub_f32(v263, v265); + float32x2_t v282 = vadd_f32(v263, v261); + float32x2_t v283 = vsub_f32(v263, v261); + v6[ostride * 4] = v249; + v6[ostride * 12] = v248; + float32x2_t v284 = vadd_f32(v268, v278); + float32x2_t v285 = vadd_f32(v269, v279); + float32x2_t v286 = vsub_f32(v270, v279); + float32x2_t v287 = vsub_f32(v271, v278); + float32x2_t v288 = vadd_f32(v272, v280); + float32x2_t v289 = vadd_f32(v273, v281); + float32x2_t v290 = vsub_f32(v274, v283); + float32x2_t v291 = vsub_f32(v275, v282); + v6[ostride * 2] = v257; + v6[ostride * 6] = v256; + v6[ostride * 10] = v255; + v6[ostride * 14] = v254; + v6[ostride] = v287; + v6[ostride * 3] = v290; + v6[ostride * 5] = v291; + v6[ostride * 7] = v286; + v6[ostride * 9] = v285; + v6[ostride * 11] = v288; + v6[ostride * 13] = v289; + v6[ostride * 15] = v284; + v5 += 1 * idist; + v6 += 1 * 1; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cf32_ac_n_gu16(const armral_cmplx_int16_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + int idist, float dir) { + int64_t v0 = istride; + int64_t v1 = idist; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * v1; + int64_t v12 = v10 * 1; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + float v238 = -1.0000000000000000e+00F; + float v245 = -7.0710678118654746e-01F; + float v252 = 7.0710678118654757e-01F; + float v257 = -9.2387953251128674e-01F; + float v264 = 5.4119610014619690e-01F; + float v271 = 
-1.3065629648763766e+00F; + float v278 = 3.8268343236508984e-01F; + float v283 = 1.3065629648763766e+00F; + float v288 = -5.4119610014619690e-01F; + const int32_t *v527 = &v5[v0]; + float32x2_t *v627 = &v6[v2]; + int64_t v27 = v0 * 8; + int64_t v37 = v0 * 4; + int64_t v45 = v0 * 12; + int64_t v55 = v0 * 2; + int64_t v63 = v0 * 10; + int64_t v73 = v0 * 6; + int64_t v81 = v0 * 14; + int64_t v99 = v0 * 9; + int64_t v109 = v0 * 5; + int64_t v117 = v0 * 13; + int64_t v127 = v0 * 3; + int64_t v135 = v0 * 11; + int64_t v145 = v0 * 7; + int64_t v153 = v0 * 15; + float v241 = v4 * v238; + float v248 = v4 * v245; + float v260 = v4 * v257; + float v267 = v4 * v264; + float v274 = v4 * v271; + int64_t v351 = v2 * 2; + int64_t v358 = v2 * 3; + int64_t v365 = v2 * 4; + int64_t v372 = v2 * 5; + int64_t v379 = v2 * 6; + int64_t v386 = v2 * 7; + int64_t v393 = v2 * 8; + int64_t v400 = v2 * 9; + int64_t v407 = v2 * 10; + int64_t v414 = v2 * 11; + int64_t v421 = v2 * 12; + int64_t v428 = v2 * 13; + int64_t v435 = v2 * 14; + int64_t v442 = v2 * 15; + const int32_t *v455 = &v5[0]; + svint64_t v591 = svindex_s64(0, v1); + svfloat32_t v604 = svdup_n_f32(v252); + svfloat32_t v608 = svdup_n_f32(v278); + svfloat32_t v609 = svdup_n_f32(v283); + svfloat32_t v610 = svdup_n_f32(v288); + float32x2_t *v618 = &v6[0]; + svint16_t v457 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v455), v591)); + const int32_t *v464 = &v5[v27]; + const int32_t *v473 = &v5[v37]; + const int32_t *v482 = &v5[v45]; + const int32_t *v491 = &v5[v55]; + const int32_t *v500 = &v5[v63]; + const int32_t *v509 = &v5[v73]; + const int32_t *v518 = &v5[v81]; + svint16_t v529 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v527), v591)); + const int32_t *v536 = &v5[v99]; + const int32_t *v545 = &v5[v109]; + const int32_t *v554 = &v5[v117]; + const int32_t *v563 = &v5[v127]; + const int32_t *v572 = &v5[v135]; + const int32_t *v581 = &v5[v145]; + const int32_t *v590 = &v5[v153]; + svfloat32_t v602 = svdup_n_f32(v241); + svfloat32_t v603 = svdup_n_f32(v248); + svfloat32_t v605 = svdup_n_f32(v260); + svfloat32_t v606 = svdup_n_f32(v267); + svfloat32_t v607 = svdup_n_f32(v274); + float32x2_t *v636 = &v6[v351]; + float32x2_t *v645 = &v6[v358]; + float32x2_t *v654 = &v6[v365]; + float32x2_t *v663 = &v6[v372]; + float32x2_t *v672 = &v6[v379]; + float32x2_t *v681 = &v6[v386]; + float32x2_t *v690 = &v6[v393]; + float32x2_t *v699 = &v6[v400]; + float32x2_t *v708 = &v6[v407]; + float32x2_t *v717 = &v6[v414]; + float32x2_t *v726 = &v6[v421]; + float32x2_t *v735 = &v6[v428]; + float32x2_t *v744 = &v6[v435]; + float32x2_t *v753 = &v6[v442]; + svfloat32_t v25 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v457, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v97 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v529, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svint16_t v466 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v464), v591)); + svint16_t v475 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v473), v591)); + svint16_t v484 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v482), v591)); + svint16_t v493 = svreinterpret_s16_u64( + 
svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v491), v591)); + svint16_t v502 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v500), v591)); + svint16_t v511 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v509), v591)); + svint16_t v520 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v518), v591)); + svint16_t v538 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v536), v591)); + svint16_t v547 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v545), v591)); + svint16_t v556 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v554), v591)); + svint16_t v565 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v563), v591)); + svint16_t v574 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v572), v591)); + svint16_t v583 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v581), v591)); + svint16_t v592 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v590), v591)); + svfloat32_t v33 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v466, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v43 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v475, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v51 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v484, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v61 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v493, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v69 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v502, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v79 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v511, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v87 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v520, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v105 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v538, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v115 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v547, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v123 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v556, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 
0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v133 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v565, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v141 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v574, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v151 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v583, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v159 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v592, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v34 = svadd_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v35 = svsub_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v52 = svadd_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v53 = svsub_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v70 = svadd_f32_x(svptrue_b32(), v61, v69); + svfloat32_t v71 = svsub_f32_x(svptrue_b32(), v61, v69); + svfloat32_t v88 = svadd_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v89 = svsub_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v106 = svadd_f32_x(svptrue_b32(), v97, v105); + svfloat32_t v107 = svsub_f32_x(svptrue_b32(), v97, v105); + svfloat32_t v124 = svadd_f32_x(svptrue_b32(), v115, v123); + svfloat32_t v125 = svsub_f32_x(svptrue_b32(), v115, v123); + svfloat32_t v142 = svadd_f32_x(svptrue_b32(), v133, v141); + svfloat32_t v143 = svsub_f32_x(svptrue_b32(), v133, v141); + svfloat32_t v160 = svadd_f32_x(svptrue_b32(), v151, v159); + svfloat32_t v161 = svsub_f32_x(svptrue_b32(), v151, v159); + svfloat32_t v162 = svadd_f32_x(svptrue_b32(), v34, v52); + svfloat32_t v163 = svsub_f32_x(svptrue_b32(), v34, v52); + svfloat32_t v164 = svadd_f32_x(svptrue_b32(), v70, v88); + svfloat32_t v165 = svsub_f32_x(svptrue_b32(), v70, v88); + svfloat32_t v166 = svadd_f32_x(svptrue_b32(), v106, v124); + svfloat32_t v167 = svsub_f32_x(svptrue_b32(), v106, v124); + svfloat32_t v168 = svadd_f32_x(svptrue_b32(), v142, v160); + svfloat32_t v169 = svsub_f32_x(svptrue_b32(), v142, v160); + svfloat32_t v178 = svadd_f32_x(svptrue_b32(), v71, v89); + svfloat32_t v179 = svsub_f32_x(svptrue_b32(), v71, v89); + svfloat32_t v180 = svadd_f32_x(svptrue_b32(), v107, v161); + svfloat32_t v181 = svsub_f32_x(svptrue_b32(), v107, v161); + svfloat32_t v182 = svadd_f32_x(svptrue_b32(), v125, v143); + svfloat32_t v183 = svsub_f32_x(svptrue_b32(), v125, v143); + svfloat32_t zero243 = svdup_n_f32(0); + svfloat32_t v243 = svcmla_f32_x(pred_full, zero243, v602, v53, 90); + svfloat32_t v170 = svadd_f32_x(svptrue_b32(), v162, v164); + svfloat32_t v171 = svsub_f32_x(svptrue_b32(), v162, v164); + svfloat32_t v172 = svadd_f32_x(svptrue_b32(), v166, v168); + svfloat32_t v173 = svsub_f32_x(svptrue_b32(), v166, v168); + svfloat32_t v176 = svadd_f32_x(svptrue_b32(), v167, v169); + svfloat32_t v177 = svsub_f32_x(svptrue_b32(), v167, v169); + svfloat32_t v184 = svadd_f32_x(svptrue_b32(), v180, v182); + svfloat32_t v185 = svadd_f32_x(svptrue_b32(), v181, v183); + svfloat32_t zero219 = svdup_n_f32(0); + svfloat32_t v219 = svcmla_f32_x(pred_full, zero219, v602, v165, 90); + svfloat32_t zero250 = svdup_n_f32(0); + svfloat32_t v250 
= svcmla_f32_x(pred_full, zero250, v603, v178, 90); + svfloat32_t zero276 = svdup_n_f32(0); + svfloat32_t v276 = svcmla_f32_x(pred_full, zero276, v607, v182, 90); + svfloat32_t v286 = svmul_f32_x(svptrue_b32(), v181, v609); + svfloat32_t v291 = svmul_f32_x(svptrue_b32(), v183, v610); + svfloat32_t v174 = svadd_f32_x(svptrue_b32(), v170, v172); + svfloat32_t v175 = svsub_f32_x(svptrue_b32(), v170, v172); + svfloat32_t zero207 = svdup_n_f32(0); + svfloat32_t v207 = svcmla_f32_x(pred_full, zero207, v602, v173, 90); + svfloat32_t zero226 = svdup_n_f32(0); + svfloat32_t v226 = svcmla_f32_x(pred_full, zero226, v603, v176, 90); + svfloat32_t zero262 = svdup_n_f32(0); + svfloat32_t v262 = svcmla_f32_x(pred_full, zero262, v605, v184, 90); + svfloat32_t v281 = svmul_f32_x(svptrue_b32(), v185, v608); + svfloat32_t v302 = svmla_f32_x(pred_full, v35, v179, v604); + svfloat32_t v303 = svmls_f32_x(pred_full, v35, v179, v604); + svfloat32_t v304 = svadd_f32_x(svptrue_b32(), v243, v250); + svfloat32_t v305 = svsub_f32_x(svptrue_b32(), v243, v250); + svfloat32_t v292 = svadd_f32_x(svptrue_b32(), v171, v207); + svfloat32_t v293 = svsub_f32_x(svptrue_b32(), v171, v207); + svfloat32_t v294 = svmla_f32_x(pred_full, v163, v177, v604); + svfloat32_t v295 = svadd_f32_x(svptrue_b32(), v219, v226); + svfloat32_t v296 = svmls_f32_x(pred_full, v163, v177, v604); + svfloat32_t v297 = svsub_f32_x(svptrue_b32(), v226, v219); + svfloat32_t v306 = svcmla_f32_x(pred_full, v262, v606, v180, 90); + svfloat32_t v307 = svsub_f32_x(svptrue_b32(), v262, v276); + svfloat32_t v308 = svnmls_f32_x(pred_full, v281, v181, v609); + svfloat32_t v309 = svnmls_f32_x(pred_full, v281, v183, v610); + svfloat32_t v310 = svnmls_f32_x(pred_full, v286, v185, v608); + svfloat32_t v311 = svnmls_f32_x(pred_full, v291, v185, v608); + svfloat32_t v316 = svadd_f32_x(svptrue_b32(), v303, v305); + svfloat32_t v317 = svsub_f32_x(svptrue_b32(), v303, v305); + svst1_f64(pred_full, (double *)(v618), svreinterpret_f64_f32(v174)); + svst1_f64(pred_full, (double *)(v690), svreinterpret_f64_f32(v175)); + svfloat32_t v298 = svadd_f32_x(svptrue_b32(), v294, v295); + svfloat32_t v299 = svadd_f32_x(svptrue_b32(), v296, v297); + svfloat32_t v300 = svsub_f32_x(svptrue_b32(), v296, v297); + svfloat32_t v301 = svsub_f32_x(svptrue_b32(), v294, v295); + svfloat32_t v312 = svadd_f32_x(svptrue_b32(), v302, v308); + svfloat32_t v313 = svsub_f32_x(svptrue_b32(), v302, v308); + svfloat32_t v314 = svadd_f32_x(svptrue_b32(), v302, v310); + svfloat32_t v315 = svsub_f32_x(svptrue_b32(), v302, v310); + svfloat32_t v318 = svadd_f32_x(svptrue_b32(), v303, v311); + svfloat32_t v319 = svsub_f32_x(svptrue_b32(), v303, v311); + svfloat32_t v322 = svadd_f32_x(svptrue_b32(), v306, v304); + svfloat32_t v323 = svsub_f32_x(svptrue_b32(), v306, v304); + svfloat32_t v324 = svadd_f32_x(svptrue_b32(), v307, v309); + svfloat32_t v325 = svsub_f32_x(svptrue_b32(), v307, v309); + svfloat32_t v326 = svadd_f32_x(svptrue_b32(), v307, v305); + svfloat32_t v327 = svsub_f32_x(svptrue_b32(), v307, v305); + svst1_f64(pred_full, (double *)(v654), svreinterpret_f64_f32(v293)); + svst1_f64(pred_full, (double *)(v726), svreinterpret_f64_f32(v292)); + svfloat32_t v328 = svadd_f32_x(svptrue_b32(), v312, v322); + svfloat32_t v329 = svadd_f32_x(svptrue_b32(), v313, v323); + svfloat32_t v330 = svsub_f32_x(svptrue_b32(), v314, v323); + svfloat32_t v331 = svsub_f32_x(svptrue_b32(), v315, v322); + svfloat32_t v332 = svadd_f32_x(svptrue_b32(), v316, v324); + svfloat32_t v333 = svadd_f32_x(svptrue_b32(), v317, v325); + 
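/* The remaining radix-16 butterfly outputs are formed below and written to ostride-spaced offsets. */ +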
svfloat32_t v334 = svsub_f32_x(svptrue_b32(), v318, v327); + svfloat32_t v335 = svsub_f32_x(svptrue_b32(), v319, v326); + svst1_f64(pred_full, (double *)(v636), svreinterpret_f64_f32(v301)); + svst1_f64(pred_full, (double *)(v672), svreinterpret_f64_f32(v300)); + svst1_f64(pred_full, (double *)(v708), svreinterpret_f64_f32(v299)); + svst1_f64(pred_full, (double *)(v744), svreinterpret_f64_f32(v298)); + svst1_f64(pred_full, (double *)(v627), svreinterpret_f64_f32(v331)); + svst1_f64(pred_full, (double *)(v645), svreinterpret_f64_f32(v334)); + svst1_f64(pred_full, (double *)(v663), svreinterpret_f64_f32(v335)); + svst1_f64(pred_full, (double *)(v681), svreinterpret_f64_f32(v330)); + svst1_f64(pred_full, (double *)(v699), svreinterpret_f64_f32(v329)); + svst1_f64(pred_full, (double *)(v717), svreinterpret_f64_f32(v332)); + svst1_f64(pred_full, (double *)(v735), svreinterpret_f64_f32(v333)); + svst1_f64(pred_full, (double *)(v753), svreinterpret_f64_f32(v328)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cf32_ac_n_gu17(const armral_cmplx_int16_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + int idist, float dir) { + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + for (int j = 0; j < howmany; j += 1) { + int16x4_t v20 = vld1s_s16(&v5[istride]); + float v190 = -4.2602849117736000e-02F; + float v194 = 2.0497965023262180e-01F; + float v198 = 1.0451835201736759e+00F; + float v202 = 1.7645848660222969e+00F; + float v206 = -7.2340797728605655e-01F; + float v210 = -8.9055591620606403e-02F; + float v214 = -1.0625000000000000e+00F; + float v218 = 2.5769410160110379e-01F; + float v222 = 7.7980260789483757e-01F; + float v226 = 5.4389318464570580e-01F; + float v230 = 4.2010193497052700e-01F; + float v234 = 1.2810929434228073e+00F; + float v238 = 4.4088907348175338e-01F; + float v242 = 3.1717619283272508e-01F; + float v245 = -9.0138318648016680e-01F; + float v246 = 9.0138318648016680e-01F; + float v252 = -4.3248756360072310e-01F; + float v253 = 4.3248756360072310e-01F; + float v259 = 6.6693537504044498e-01F; + float v260 = -6.6693537504044498e-01F; + float v266 = -6.0389004312516970e-01F; + float v267 = 6.0389004312516970e-01F; + float v273 = -3.6924873198582547e-01F; + float v274 = 3.6924873198582547e-01F; + float v280 = 4.8656938755549761e-01F; + float v281 = -4.8656938755549761e-01F; + float v287 = 2.3813712136760609e-01F; + float v288 = -2.3813712136760609e-01F; + float v294 = -1.5573820617422458e+00F; + float v295 = 1.5573820617422458e+00F; + float v301 = 6.5962247018731990e-01F; + float v302 = -6.5962247018731990e-01F; + float v308 = -1.4316961569866241e-01F; + float v309 = 1.4316961569866241e-01F; + float v315 = 2.3903469959860771e-01F; + float v316 = -2.3903469959860771e-01F; + float v322 = -4.7932541949972603e-02F; + float v323 = 4.7932541949972603e-02F; + float v329 = -2.3188014856550065e+00F; + float v330 = 2.3188014856550065e+00F; + float v336 = 7.8914568419206255e-01F; + float v337 = -7.8914568419206255e-01F; + float v343 = 3.8484572871179505e+00F; + float v344 = -3.8484572871179505e+00F; + float v350 = -1.3003804568801376e+00F; + float v351 = 1.3003804568801376e+00F; + float v357 = 4.0814769046889037e+00F; + float v358 = -4.0814769046889037e+00F; + float v364 = -1.4807159909286283e+00F; + float v365 = 1.4807159909286283e+00F; + float v371 = -1.3332470363551400e-02F; + float v372 = 1.3332470363551400e-02F; + float v378 = -3.7139778690557629e-01F; + 
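/* Constant pairs such as (v378, v379) hold (-w, w); they are packed into float32x2 values and multiplied by v388 = {dir, dir} so one kernel serves both forward and inverse transforms. */ +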
float v379 = 3.7139778690557629e-01F; + float v385 = 1.9236512863456379e-01F; + float v386 = -1.9236512863456379e-01F; + float32x2_t v388 = (float32x2_t){v4, v4}; + float32x2_t v21 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v20)), 15); + int16x4_t v182 = vld1s_s16(&v5[0]); + float32x2_t v191 = (float32x2_t){v190, v190}; + float32x2_t v195 = (float32x2_t){v194, v194}; + float32x2_t v199 = (float32x2_t){v198, v198}; + float32x2_t v203 = (float32x2_t){v202, v202}; + float32x2_t v207 = (float32x2_t){v206, v206}; + float32x2_t v211 = (float32x2_t){v210, v210}; + float32x2_t v215 = (float32x2_t){v214, v214}; + float32x2_t v219 = (float32x2_t){v218, v218}; + float32x2_t v223 = (float32x2_t){v222, v222}; + float32x2_t v227 = (float32x2_t){v226, v226}; + float32x2_t v231 = (float32x2_t){v230, v230}; + float32x2_t v235 = (float32x2_t){v234, v234}; + float32x2_t v239 = (float32x2_t){v238, v238}; + float32x2_t v243 = (float32x2_t){v242, v242}; + float32x2_t v247 = (float32x2_t){v245, v246}; + float32x2_t v254 = (float32x2_t){v252, v253}; + float32x2_t v261 = (float32x2_t){v259, v260}; + float32x2_t v268 = (float32x2_t){v266, v267}; + float32x2_t v275 = (float32x2_t){v273, v274}; + float32x2_t v282 = (float32x2_t){v280, v281}; + float32x2_t v289 = (float32x2_t){v287, v288}; + float32x2_t v296 = (float32x2_t){v294, v295}; + float32x2_t v303 = (float32x2_t){v301, v302}; + float32x2_t v310 = (float32x2_t){v308, v309}; + float32x2_t v317 = (float32x2_t){v315, v316}; + float32x2_t v324 = (float32x2_t){v322, v323}; + float32x2_t v331 = (float32x2_t){v329, v330}; + float32x2_t v338 = (float32x2_t){v336, v337}; + float32x2_t v345 = (float32x2_t){v343, v344}; + float32x2_t v352 = (float32x2_t){v350, v351}; + float32x2_t v359 = (float32x2_t){v357, v358}; + float32x2_t v366 = (float32x2_t){v364, v365}; + float32x2_t v373 = (float32x2_t){v371, v372}; + float32x2_t v380 = (float32x2_t){v378, v379}; + float32x2_t v387 = (float32x2_t){v385, v386}; + int16x4_t v26 = vld1s_s16(&v5[istride * 16]); + int16x4_t v34 = vld1s_s16(&v5[istride * 3]); + int16x4_t v40 = vld1s_s16(&v5[istride * 14]); + int16x4_t v48 = vld1s_s16(&v5[istride * 9]); + int16x4_t v54 = vld1s_s16(&v5[istride * 8]); + int16x4_t v62 = vld1s_s16(&v5[istride * 10]); + int16x4_t v68 = vld1s_s16(&v5[istride * 7]); + int16x4_t v76 = vld1s_s16(&v5[istride * 13]); + int16x4_t v82 = vld1s_s16(&v5[istride * 4]); + int16x4_t v90 = vld1s_s16(&v5[istride * 5]); + int16x4_t v96 = vld1s_s16(&v5[istride * 12]); + int16x4_t v104 = vld1s_s16(&v5[istride * 15]); + int16x4_t v110 = vld1s_s16(&v5[istride * 2]); + int16x4_t v118 = vld1s_s16(&v5[istride * 11]); + int16x4_t v124 = vld1s_s16(&v5[istride * 6]); + float32x2_t v183 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v182)), 15); + float32x2_t v249 = vmul_f32(v388, v247); + float32x2_t v256 = vmul_f32(v388, v254); + float32x2_t v263 = vmul_f32(v388, v261); + float32x2_t v270 = vmul_f32(v388, v268); + float32x2_t v277 = vmul_f32(v388, v275); + float32x2_t v284 = vmul_f32(v388, v282); + float32x2_t v291 = vmul_f32(v388, v289); + float32x2_t v298 = vmul_f32(v388, v296); + float32x2_t v305 = vmul_f32(v388, v303); + float32x2_t v312 = vmul_f32(v388, v310); + float32x2_t v319 = vmul_f32(v388, v317); + float32x2_t v326 = vmul_f32(v388, v324); + float32x2_t v333 = vmul_f32(v388, v331); + float32x2_t v340 = vmul_f32(v388, v338); + float32x2_t v347 = vmul_f32(v388, v345); + float32x2_t v354 = vmul_f32(v388, v352); + float32x2_t v361 = vmul_f32(v388, v359); + float32x2_t v368 = vmul_f32(v388, v366); + float32x2_t v375 = vmul_f32(v388, 
v373); + float32x2_t v382 = vmul_f32(v388, v380); + float32x2_t v389 = vmul_f32(v388, v387); + float32x2_t v27 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v26)), 15); + float32x2_t v35 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v34)), 15); + float32x2_t v41 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v40)), 15); + float32x2_t v49 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v48)), 15); + float32x2_t v55 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v54)), 15); + float32x2_t v63 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v62)), 15); + float32x2_t v69 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v68)), 15); + float32x2_t v77 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v76)), 15); + float32x2_t v83 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v82)), 15); + float32x2_t v91 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v90)), 15); + float32x2_t v97 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v96)), 15); + float32x2_t v105 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v104)), 15); + float32x2_t v111 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v110)), 15); + float32x2_t v119 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v118)), 15); + float32x2_t v125 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v124)), 15); + float32x2_t v28 = vadd_f32(v21, v27); + float32x2_t v29 = vsub_f32(v21, v27); + float32x2_t v42 = vadd_f32(v35, v41); + float32x2_t v43 = vsub_f32(v35, v41); + float32x2_t v56 = vadd_f32(v49, v55); + float32x2_t v57 = vsub_f32(v49, v55); + float32x2_t v70 = vadd_f32(v63, v69); + float32x2_t v71 = vsub_f32(v63, v69); + float32x2_t v84 = vadd_f32(v77, v83); + float32x2_t v85 = vsub_f32(v77, v83); + float32x2_t v98 = vadd_f32(v91, v97); + float32x2_t v99 = vsub_f32(v91, v97); + float32x2_t v112 = vadd_f32(v105, v111); + float32x2_t v113 = vsub_f32(v105, v111); + float32x2_t v126 = vadd_f32(v119, v125); + float32x2_t v127 = vsub_f32(v119, v125); + float32x2_t v128 = vadd_f32(v28, v84); + float32x2_t v129 = vadd_f32(v42, v98); + float32x2_t v130 = vadd_f32(v56, v112); + float32x2_t v131 = vadd_f32(v70, v126); + float32x2_t v134 = vsub_f32(v28, v84); + float32x2_t v135 = vsub_f32(v42, v98); + float32x2_t v136 = vsub_f32(v56, v112); + float32x2_t v137 = vsub_f32(v70, v126); + float32x2_t v148 = vadd_f32(v29, v57); + float32x2_t v149 = vadd_f32(v43, v71); + float32x2_t v150 = vsub_f32(v29, v57); + float32x2_t v151 = vsub_f32(v127, v99); + float32x2_t v152 = vadd_f32(v85, v113); + float32x2_t v153 = vadd_f32(v99, v127); + float32x2_t v154 = vsub_f32(v85, v113); + float32x2_t v155 = vsub_f32(v43, v71); + float32x2_t v168 = vadd_f32(v29, v85); + float32x2_t v169 = vadd_f32(v71, v127); + float32x2_t v341 = vrev64_f32(v29); + float32x2_t v348 = vrev64_f32(v85); + float32x2_t v362 = vrev64_f32(v71); + float32x2_t v369 = vrev64_f32(v127); + float32x2_t v132 = vadd_f32(v128, v130); + float32x2_t v133 = vadd_f32(v129, v131); + float32x2_t v138 = vsub_f32(v128, v130); + float32x2_t v139 = vsub_f32(v129, v131); + float32x2_t v142 = vadd_f32(v135, v137); + float32x2_t v143 = vadd_f32(v134, v136); + float32x2_t v145 = vsub_f32(v136, v137); + float32x2_t v146 = vsub_f32(v134, v135); + float32x2_t v156 = vadd_f32(v148, v149); + float32x2_t v157 = vadd_f32(v152, v153); + float32x2_t v159 = vsub_f32(v148, v149); + float32x2_t v160 = vsub_f32(v152, v153); + float32x2_t v162 = vadd_f32(v150, v151); + float32x2_t v163 = vadd_f32(v154, v155); + float32x2_t v165 = vsub_f32(v150, v151); + float32x2_t v166 = vsub_f32(v154, v155); + float32x2_t v192 = vmul_f32(v134, v191); + float32x2_t v196 = vmul_f32(v135, v195); + float32x2_t v200 = vmul_f32(v136, v199); + float32x2_t v204 = 
vmul_f32(v137, v203); + float32x2_t v334 = vrev64_f32(v168); + float32x2_t v342 = vmul_f32(v341, v340); + float32x2_t v349 = vmul_f32(v348, v347); + float32x2_t v355 = vrev64_f32(v169); + float32x2_t v363 = vmul_f32(v362, v361); + float32x2_t v370 = vmul_f32(v369, v368); + float32x2_t v140 = vadd_f32(v132, v133); + float32x2_t v141 = vsub_f32(v132, v133); + float32x2_t v144 = vsub_f32(v143, v142); + float32x2_t v147 = vadd_f32(v138, v139); + float32x2_t v158 = vadd_f32(v156, v157); + float32x2_t v161 = vadd_f32(v159, v160); + float32x2_t v164 = vadd_f32(v162, v163); + float32x2_t v167 = vadd_f32(v165, v166); + float32x2_t v170 = vsub_f32(v163, v157); + float32x2_t v173 = vsub_f32(v156, v162); + float32x2_t v208 = vmul_f32(v138, v207); + float32x2_t v212 = vmul_f32(v139, v211); + float32x2_t v224 = vmul_f32(v142, v223); + float32x2_t v228 = vmul_f32(v143, v227); + float32x2_t v236 = vmul_f32(v145, v235); + float32x2_t v240 = vmul_f32(v146, v239); + float32x2_t v250 = vrev64_f32(v156); + float32x2_t v257 = vrev64_f32(v157); + float32x2_t v271 = vrev64_f32(v159); + float32x2_t v278 = vrev64_f32(v160); + float32x2_t v292 = vrev64_f32(v162); + float32x2_t v299 = vrev64_f32(v163); + float32x2_t v313 = vrev64_f32(v165); + float32x2_t v320 = vrev64_f32(v166); + float32x2_t v335 = vmul_f32(v334, v333); + float32x2_t v356 = vmul_f32(v355, v354); + float32x2_t v171 = vadd_f32(v170, v29); + float32x2_t v174 = vadd_f32(v173, v71); + float32x2_t v184 = vadd_f32(v183, v140); + float32x2_t v216 = vmul_f32(v140, v215); + float32x2_t v220 = vmul_f32(v141, v219); + float32x2_t v232 = vmul_f32(v144, v231); + float32x2_t v244 = vmul_f32(v147, v243); + float32x2_t v251 = vmul_f32(v250, v249); + float32x2_t v258 = vmul_f32(v257, v256); + float32x2_t v264 = vrev64_f32(v158); + float32x2_t v272 = vmul_f32(v271, v270); + float32x2_t v279 = vmul_f32(v278, v277); + float32x2_t v285 = vrev64_f32(v161); + float32x2_t v293 = vmul_f32(v292, v291); + float32x2_t v300 = vmul_f32(v299, v298); + float32x2_t v306 = vrev64_f32(v164); + float32x2_t v314 = vmul_f32(v313, v312); + float32x2_t v321 = vmul_f32(v320, v319); + float32x2_t v327 = vrev64_f32(v167); + float32x2_t v394 = vadd_f32(v204, v236); + float32x2_t v395 = vsub_f32(v236, v200); + float32x2_t v396 = vadd_f32(v196, v240); + float32x2_t v397 = vsub_f32(v192, v240); + float32x2_t v172 = vsub_f32(v171, v169); + float32x2_t v175 = vadd_f32(v174, v85); + float32x2_t v265 = vmul_f32(v264, v263); + float32x2_t v286 = vmul_f32(v285, v284); + float32x2_t v307 = vmul_f32(v306, v305); + float32x2_t v328 = vmul_f32(v327, v326); + float32x2_t v392 = vadd_f32(v224, v232); + float32x2_t v393 = vsub_f32(v228, v232); + float32x2_t v398 = vsub_f32(v244, v212); + float32x2_t v399 = vadd_f32(v244, v208); + float32x2_t v400 = vadd_f32(v216, v184); + v6[0] = v184; + float32x2_t v176 = vsub_f32(v175, v127); + float32x2_t v376 = vrev64_f32(v172); + float32x2_t v401 = vadd_f32(v220, v400); + float32x2_t v402 = vsub_f32(v400, v220); + float32x2_t v403 = vsub_f32(v392, v394); + float32x2_t v405 = vadd_f32(v393, v395); + float32x2_t v407 = vadd_f32(v392, v396); + float32x2_t v409 = vadd_f32(v393, v397); + float32x2_t v419 = vadd_f32(v251, v265); + float32x2_t v420 = vadd_f32(v258, v265); + float32x2_t v421 = vadd_f32(v272, v286); + float32x2_t v422 = vadd_f32(v279, v286); + float32x2_t v423 = vadd_f32(v293, v307); + float32x2_t v424 = vadd_f32(v300, v307); + float32x2_t v425 = vadd_f32(v314, v328); + float32x2_t v426 = vadd_f32(v321, v328); + float32x2_t v177 = vadd_f32(v172, v176); + 
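/* Final recombination of the length-17 partial sums; the stores below write the outputs to v6[ostride * k]. */ +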
float32x2_t v377 = vmul_f32(v376, v375); + float32x2_t v383 = vrev64_f32(v176); + float32x2_t v404 = vadd_f32(v398, v401); + float32x2_t v406 = vadd_f32(v399, v402); + float32x2_t v408 = vsub_f32(v401, v398); + float32x2_t v410 = vsub_f32(v402, v399); + float32x2_t v430 = vadd_f32(v419, v421); + float32x2_t v431 = vsub_f32(v419, v421); + float32x2_t v432 = vadd_f32(v420, v422); + float32x2_t v433 = vsub_f32(v420, v422); + float32x2_t v434 = vadd_f32(v423, v425); + float32x2_t v435 = vsub_f32(v425, v423); + float32x2_t v436 = vadd_f32(v424, v426); + float32x2_t v437 = vsub_f32(v426, v424); + float32x2_t v384 = vmul_f32(v383, v382); + float32x2_t v390 = vrev64_f32(v177); + float32x2_t v411 = vadd_f32(v403, v404); + float32x2_t v412 = vadd_f32(v405, v406); + float32x2_t v413 = vadd_f32(v407, v408); + float32x2_t v414 = vadd_f32(v409, v410); + float32x2_t v415 = vsub_f32(v404, v403); + float32x2_t v416 = vsub_f32(v406, v405); + float32x2_t v417 = vsub_f32(v408, v407); + float32x2_t v418 = vsub_f32(v410, v409); + float32x2_t v447 = vadd_f32(v432, v436); + float32x2_t v449 = vadd_f32(v431, v437); + float32x2_t v451 = vsub_f32(v430, v434); + float32x2_t v453 = vsub_f32(v437, v431); + float32x2_t v455 = vadd_f32(v430, v434); + float32x2_t v458 = vsub_f32(v435, v433); + float32x2_t v461 = vsub_f32(v436, v432); + float32x2_t v464 = vadd_f32(v433, v435); + float32x2_t v391 = vmul_f32(v390, v389); + float32x2_t v438 = vsub_f32(v377, v384); + float32x2_t v427 = vadd_f32(v391, v384); + float32x2_t v440 = vadd_f32(v438, v438); + float32x2_t v465 = vsub_f32(v464, v438); + float32x2_t v428 = vadd_f32(v335, v427); + float32x2_t v441 = vsub_f32(v356, v440); + float32x2_t v444 = vadd_f32(v427, v427); + float32x2_t v462 = vadd_f32(v461, v440); + float32x2_t v495 = vadd_f32(v418, v465); + float32x2_t v501 = vsub_f32(v418, v465); + float32x2_t v429 = vadd_f32(v428, v342); + float32x2_t v439 = vadd_f32(v428, v349); + float32x2_t v442 = vadd_f32(v441, v363); + float32x2_t v443 = vadd_f32(v441, v370); + float32x2_t v445 = vadd_f32(v444, v444); + float32x2_t v446 = vadd_f32(v438, v444); + float32x2_t v452 = vadd_f32(v451, v444); + float32x2_t v463 = vadd_f32(v462, v444); + v6[ostride * 3] = v495; + v6[ostride * 14] = v501; + float32x2_t v448 = vadd_f32(v447, v439); + float32x2_t v450 = vadd_f32(v449, v442); + float32x2_t v454 = vsub_f32(v453, v446); + float32x2_t v456 = vadd_f32(v455, v429); + float32x2_t v459 = vsub_f32(v458, v443); + float32x2_t v483 = vadd_f32(v413, v452); + float32x2_t v489 = vsub_f32(v413, v452); + float32x2_t v555 = vadd_f32(v417, v463); + float32x2_t v561 = vsub_f32(v417, v463); + float32x2_t v457 = vadd_f32(v456, v438); + float32x2_t v460 = vadd_f32(v459, v445); + float32x2_t v471 = vadd_f32(v411, v448); + float32x2_t v477 = vsub_f32(v411, v448); + v6[ostride * 2] = v483; + v6[ostride * 15] = v489; + float32x2_t v519 = vadd_f32(v414, v454); + float32x2_t v525 = vsub_f32(v414, v454); + float32x2_t v531 = vadd_f32(v412, v450); + float32x2_t v537 = vsub_f32(v412, v450); + v6[ostride * 8] = v555; + v6[ostride * 9] = v561; + v6[ostride] = v471; + v6[ostride * 16] = v477; + float32x2_t v507 = vadd_f32(v415, v457); + float32x2_t v513 = vsub_f32(v415, v457); + v6[ostride * 5] = v519; + v6[ostride * 12] = v525; + v6[ostride * 6] = v531; + v6[ostride * 11] = v537; + float32x2_t v543 = vadd_f32(v416, v460); + float32x2_t v549 = vsub_f32(v416, v460); + v6[ostride * 4] = v507; + v6[ostride * 13] = v513; + v6[ostride * 7] = v543; + v6[ostride * 10] = v549; + v5 += 1 * idist; + v6 += 1 * 1; + } +} +#endif 
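+ +/* SVE batch variant of the length-17 kernel. Each iteration handles svcntd() transforms: the complex int16 inputs are gathered as 32-bit words (one sample per 64-bit lane, indexed by idist), the svtbl/convert/2^-31 sequence widens them to Q15-scaled float32 real and imaginary parts, and each output is stored as a 64-bit real/imaginary pair; svwhilelt_b32 predicates the tail of the batch. */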
+ +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cf32_ac_n_gu17(const armral_cmplx_int16_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + int idist, float dir) { + int64_t v0 = istride; + int64_t v1 = idist; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * v1; + int64_t v12 = v10 * 1; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + float v227 = -4.2602849117736000e-02F; + float v232 = 2.0497965023262180e-01F; + float v237 = 1.0451835201736759e+00F; + float v242 = 1.7645848660222969e+00F; + float v247 = -7.2340797728605655e-01F; + float v252 = -8.9055591620606403e-02F; + float v257 = -1.0625000000000000e+00F; + float v262 = 2.5769410160110379e-01F; + float v267 = 7.7980260789483757e-01F; + float v272 = 5.4389318464570580e-01F; + float v277 = 4.2010193497052700e-01F; + float v282 = 1.2810929434228073e+00F; + float v287 = 4.4088907348175338e-01F; + float v292 = 3.1717619283272508e-01F; + float v297 = 9.0138318648016680e-01F; + float v304 = 4.3248756360072310e-01F; + float v311 = -6.6693537504044498e-01F; + float v318 = 6.0389004312516970e-01F; + float v325 = 3.6924873198582547e-01F; + float v332 = -4.8656938755549761e-01F; + float v339 = -2.3813712136760609e-01F; + float v346 = 1.5573820617422458e+00F; + float v353 = -6.5962247018731990e-01F; + float v360 = 1.4316961569866241e-01F; + float v367 = -2.3903469959860771e-01F; + float v374 = 4.7932541949972603e-02F; + float v381 = 2.3188014856550065e+00F; + float v388 = -7.8914568419206255e-01F; + float v395 = -3.8484572871179505e+00F; + float v402 = 1.3003804568801376e+00F; + float v409 = -4.0814769046889037e+00F; + float v416 = 1.4807159909286283e+00F; + float v423 = 1.3332470363551400e-02F; + float v430 = 3.7139778690557629e-01F; + float v437 = -1.9236512863456379e-01F; + const int32_t *v658 = &v5[v0]; + float32x2_t *v858 = &v6[v2]; + int64_t v27 = v0 * 16; + int64_t v37 = v0 * 3; + int64_t v45 = v0 * 14; + int64_t v55 = v0 * 9; + int64_t v63 = v0 * 8; + int64_t v73 = v0 * 10; + int64_t v81 = v0 * 7; + int64_t v91 = v0 * 13; + int64_t v99 = v0 * 4; + int64_t v109 = v0 * 5; + int64_t v117 = v0 * 12; + int64_t v127 = v0 * 15; + int64_t v135 = v0 * 2; + int64_t v145 = v0 * 11; + int64_t v153 = v0 * 6; + float v300 = v4 * v297; + float v307 = v4 * v304; + float v314 = v4 * v311; + float v321 = v4 * v318; + float v328 = v4 * v325; + float v335 = v4 * v332; + float v342 = v4 * v339; + float v349 = v4 * v346; + float v356 = v4 * v353; + float v363 = v4 * v360; + float v370 = v4 * v367; + float v377 = v4 * v374; + float v384 = v4 * v381; + float v391 = v4 * v388; + float v398 = v4 * v395; + float v405 = v4 * v402; + float v412 = v4 * v409; + float v419 = v4 * v416; + float v426 = v4 * v423; + float v433 = v4 * v430; + float v440 = v4 * v437; + int64_t v534 = v2 * 16; + int64_t v542 = v2 * 2; + int64_t v550 = v2 * 15; + int64_t v558 = v2 * 3; + int64_t v566 = v2 * 14; + int64_t v574 = v2 * 4; + int64_t v582 = v2 * 13; + int64_t v590 = v2 * 5; + int64_t v598 = v2 * 12; + int64_t v606 = v2 * 6; + int64_t v614 = v2 * 11; + int64_t v622 = v2 * 7; + int64_t v630 = v2 * 10; + int64_t v638 = v2 * 8; + int64_t v646 = v2 * 9; + const int32_t *v803 = &v5[0]; + svint64_t v804 = svindex_s64(0, v1); + svfloat32_t v807 = svdup_n_f32(v227); + svfloat32_t v808 = svdup_n_f32(v232); + svfloat32_t v809 = svdup_n_f32(v237); + 
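/* v807-v841 broadcast the scalar kernel constants to all lanes (v821 onward hold the dir-scaled values) for the vector butterflies that follow. */ +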
svfloat32_t v810 = svdup_n_f32(v242); + svfloat32_t v811 = svdup_n_f32(v247); + svfloat32_t v812 = svdup_n_f32(v252); + svfloat32_t v813 = svdup_n_f32(v257); + svfloat32_t v814 = svdup_n_f32(v262); + svfloat32_t v815 = svdup_n_f32(v267); + svfloat32_t v816 = svdup_n_f32(v272); + svfloat32_t v817 = svdup_n_f32(v277); + svfloat32_t v818 = svdup_n_f32(v282); + svfloat32_t v819 = svdup_n_f32(v287); + svfloat32_t v820 = svdup_n_f32(v292); + float32x2_t *v849 = &v6[0]; + svint16_t v660 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v658), v804)); + const int32_t *v667 = &v5[v27]; + const int32_t *v676 = &v5[v37]; + const int32_t *v685 = &v5[v45]; + const int32_t *v694 = &v5[v55]; + const int32_t *v703 = &v5[v63]; + const int32_t *v712 = &v5[v73]; + const int32_t *v721 = &v5[v81]; + const int32_t *v730 = &v5[v91]; + const int32_t *v739 = &v5[v99]; + const int32_t *v748 = &v5[v109]; + const int32_t *v757 = &v5[v117]; + const int32_t *v766 = &v5[v127]; + const int32_t *v775 = &v5[v135]; + const int32_t *v784 = &v5[v145]; + const int32_t *v793 = &v5[v153]; + svint16_t v805 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v803), v804)); + svfloat32_t v821 = svdup_n_f32(v300); + svfloat32_t v822 = svdup_n_f32(v307); + svfloat32_t v823 = svdup_n_f32(v314); + svfloat32_t v824 = svdup_n_f32(v321); + svfloat32_t v825 = svdup_n_f32(v328); + svfloat32_t v826 = svdup_n_f32(v335); + svfloat32_t v827 = svdup_n_f32(v342); + svfloat32_t v828 = svdup_n_f32(v349); + svfloat32_t v829 = svdup_n_f32(v356); + svfloat32_t v830 = svdup_n_f32(v363); + svfloat32_t v831 = svdup_n_f32(v370); + svfloat32_t v832 = svdup_n_f32(v377); + svfloat32_t v833 = svdup_n_f32(v384); + svfloat32_t v834 = svdup_n_f32(v391); + svfloat32_t v835 = svdup_n_f32(v398); + svfloat32_t v836 = svdup_n_f32(v405); + svfloat32_t v837 = svdup_n_f32(v412); + svfloat32_t v838 = svdup_n_f32(v419); + svfloat32_t v839 = svdup_n_f32(v426); + svfloat32_t v840 = svdup_n_f32(v433); + svfloat32_t v841 = svdup_n_f32(v440); + float32x2_t *v867 = &v6[v534]; + float32x2_t *v876 = &v6[v542]; + float32x2_t *v885 = &v6[v550]; + float32x2_t *v894 = &v6[v558]; + float32x2_t *v903 = &v6[v566]; + float32x2_t *v912 = &v6[v574]; + float32x2_t *v921 = &v6[v582]; + float32x2_t *v930 = &v6[v590]; + float32x2_t *v939 = &v6[v598]; + float32x2_t *v948 = &v6[v606]; + float32x2_t *v957 = &v6[v614]; + float32x2_t *v966 = &v6[v622]; + float32x2_t *v975 = &v6[v630]; + float32x2_t *v984 = &v6[v638]; + float32x2_t *v993 = &v6[v646]; + svfloat32_t v25 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v660, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v219 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v805, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svint16_t v669 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v667), v804)); + svint16_t v678 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v676), v804)); + svint16_t v687 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v685), v804)); + svint16_t v696 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v694), v804)); + svint16_t v705 = svreinterpret_s16_u64( + 
svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v703), v804)); + svint16_t v714 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v712), v804)); + svint16_t v723 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v721), v804)); + svint16_t v732 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v730), v804)); + svint16_t v741 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v739), v804)); + svint16_t v750 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v748), v804)); + svint16_t v759 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v757), v804)); + svint16_t v768 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v766), v804)); + svint16_t v777 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v775), v804)); + svint16_t v786 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v784), v804)); + svint16_t v795 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v793), v804)); + svfloat32_t v33 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v669, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v43 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v678, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v51 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v687, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v61 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v696, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v69 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v705, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v79 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v714, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v87 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v723, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v97 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v732, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v105 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v741, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v115 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v750, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 
0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v123 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v759, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v133 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v768, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v141 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v777, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v151 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v786, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v159 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v795, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v34 = svadd_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v35 = svsub_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v52 = svadd_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v53 = svsub_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v70 = svadd_f32_x(svptrue_b32(), v61, v69); + svfloat32_t v71 = svsub_f32_x(svptrue_b32(), v61, v69); + svfloat32_t v88 = svadd_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v89 = svsub_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v106 = svadd_f32_x(svptrue_b32(), v97, v105); + svfloat32_t v107 = svsub_f32_x(svptrue_b32(), v97, v105); + svfloat32_t v124 = svadd_f32_x(svptrue_b32(), v115, v123); + svfloat32_t v125 = svsub_f32_x(svptrue_b32(), v115, v123); + svfloat32_t v142 = svadd_f32_x(svptrue_b32(), v133, v141); + svfloat32_t v143 = svsub_f32_x(svptrue_b32(), v133, v141); + svfloat32_t v160 = svadd_f32_x(svptrue_b32(), v151, v159); + svfloat32_t v161 = svsub_f32_x(svptrue_b32(), v151, v159); + svfloat32_t v162 = svadd_f32_x(svptrue_b32(), v34, v106); + svfloat32_t v163 = svadd_f32_x(svptrue_b32(), v52, v124); + svfloat32_t v164 = svadd_f32_x(svptrue_b32(), v70, v142); + svfloat32_t v165 = svadd_f32_x(svptrue_b32(), v88, v160); + svfloat32_t v168 = svsub_f32_x(svptrue_b32(), v34, v106); + svfloat32_t v169 = svsub_f32_x(svptrue_b32(), v52, v124); + svfloat32_t v170 = svsub_f32_x(svptrue_b32(), v70, v142); + svfloat32_t v171 = svsub_f32_x(svptrue_b32(), v88, v160); + svfloat32_t v182 = svadd_f32_x(svptrue_b32(), v35, v71); + svfloat32_t v183 = svadd_f32_x(svptrue_b32(), v53, v89); + svfloat32_t v184 = svsub_f32_x(svptrue_b32(), v35, v71); + svfloat32_t v185 = svsub_f32_x(svptrue_b32(), v161, v125); + svfloat32_t v186 = svadd_f32_x(svptrue_b32(), v107, v143); + svfloat32_t v187 = svadd_f32_x(svptrue_b32(), v125, v161); + svfloat32_t v188 = svsub_f32_x(svptrue_b32(), v107, v143); + svfloat32_t v189 = svsub_f32_x(svptrue_b32(), v53, v89); + svfloat32_t v202 = svadd_f32_x(svptrue_b32(), v35, v107); + svfloat32_t v203 = svadd_f32_x(svptrue_b32(), v89, v161); + svfloat32_t v166 = svadd_f32_x(svptrue_b32(), v162, v164); + svfloat32_t v167 = svadd_f32_x(svptrue_b32(), v163, v165); + svfloat32_t v172 = svsub_f32_x(svptrue_b32(), v162, v164); + svfloat32_t v173 = svsub_f32_x(svptrue_b32(), v163, v165); + svfloat32_t v176 = svadd_f32_x(svptrue_b32(), 
v169, v171); + svfloat32_t v177 = svadd_f32_x(svptrue_b32(), v168, v170); + svfloat32_t v179 = svsub_f32_x(svptrue_b32(), v170, v171); + svfloat32_t v180 = svsub_f32_x(svptrue_b32(), v168, v169); + svfloat32_t v190 = svadd_f32_x(svptrue_b32(), v182, v183); + svfloat32_t v191 = svadd_f32_x(svptrue_b32(), v186, v187); + svfloat32_t v193 = svsub_f32_x(svptrue_b32(), v182, v183); + svfloat32_t v194 = svsub_f32_x(svptrue_b32(), v186, v187); + svfloat32_t v196 = svadd_f32_x(svptrue_b32(), v184, v185); + svfloat32_t v197 = svadd_f32_x(svptrue_b32(), v188, v189); + svfloat32_t v199 = svsub_f32_x(svptrue_b32(), v184, v185); + svfloat32_t v200 = svsub_f32_x(svptrue_b32(), v188, v189); + svfloat32_t v240 = svmul_f32_x(svptrue_b32(), v170, v809); + svfloat32_t zero407 = svdup_n_f32(0); + svfloat32_t v407 = svcmla_f32_x(pred_full, zero407, v836, v203, 90); + svfloat32_t v174 = svadd_f32_x(svptrue_b32(), v166, v167); + svfloat32_t v175 = svsub_f32_x(svptrue_b32(), v166, v167); + svfloat32_t v178 = svsub_f32_x(svptrue_b32(), v177, v176); + svfloat32_t v181 = svadd_f32_x(svptrue_b32(), v172, v173); + svfloat32_t v192 = svadd_f32_x(svptrue_b32(), v190, v191); + svfloat32_t v195 = svadd_f32_x(svptrue_b32(), v193, v194); + svfloat32_t v198 = svadd_f32_x(svptrue_b32(), v196, v197); + svfloat32_t v201 = svadd_f32_x(svptrue_b32(), v199, v200); + svfloat32_t v204 = svsub_f32_x(svptrue_b32(), v197, v191); + svfloat32_t v207 = svsub_f32_x(svptrue_b32(), v190, v196); + svfloat32_t v250 = svmul_f32_x(svptrue_b32(), v172, v811); + svfloat32_t v255 = svmul_f32_x(svptrue_b32(), v173, v812); + svfloat32_t v285 = svmul_f32_x(svptrue_b32(), v179, v818); + svfloat32_t v290 = svmul_f32_x(svptrue_b32(), v180, v819); + svfloat32_t v205 = svadd_f32_x(svptrue_b32(), v204, v35); + svfloat32_t v208 = svadd_f32_x(svptrue_b32(), v207, v89); + svfloat32_t v220 = svadd_f32_x(svptrue_b32(), v219, v174); + svfloat32_t v280 = svmul_f32_x(svptrue_b32(), v178, v817); + svfloat32_t zero316 = svdup_n_f32(0); + svfloat32_t v316 = svcmla_f32_x(pred_full, zero316, v823, v192, 90); + svfloat32_t zero337 = svdup_n_f32(0); + svfloat32_t v337 = svcmla_f32_x(pred_full, zero337, v826, v195, 90); + svfloat32_t zero358 = svdup_n_f32(0); + svfloat32_t v358 = svcmla_f32_x(pred_full, zero358, v829, v198, 90); + svfloat32_t zero379 = svdup_n_f32(0); + svfloat32_t v379 = svcmla_f32_x(pred_full, zero379, v832, v201, 90); + svfloat32_t v445 = svmla_f32_x(pred_full, v285, v171, v810); + svfloat32_t v446 = svnmls_f32_x(pred_full, v240, v179, v818); + svfloat32_t v447 = svmla_f32_x(pred_full, v290, v169, v808); + svfloat32_t v448 = svnmls_f32_x(pred_full, v290, v168, v807); + svfloat32_t v206 = svsub_f32_x(svptrue_b32(), v205, v203); + svfloat32_t v209 = svadd_f32_x(svptrue_b32(), v208, v107); + svfloat32_t v443 = svmla_f32_x(pred_full, v280, v176, v815); + svfloat32_t v444 = svnmls_f32_x(pred_full, v280, v177, v816); + svfloat32_t v449 = svnmls_f32_x(pred_full, v255, v181, v820); + svfloat32_t v450 = svmla_f32_x(pred_full, v250, v181, v820); + svfloat32_t v451 = svmla_f32_x(pred_full, v220, v174, v813); + svfloat32_t v470 = svcmla_f32_x(pred_full, v316, v821, v190, 90); + svfloat32_t v471 = svcmla_f32_x(pred_full, v316, v822, v191, 90); + svfloat32_t v472 = svcmla_f32_x(pred_full, v337, v824, v193, 90); + svfloat32_t v473 = svcmla_f32_x(pred_full, v337, v825, v194, 90); + svfloat32_t v474 = svcmla_f32_x(pred_full, v358, v827, v196, 90); + svfloat32_t v475 = svcmla_f32_x(pred_full, v358, v828, v197, 90); + svfloat32_t v476 = svcmla_f32_x(pred_full, v379, v830, 
v199, 90); + svfloat32_t v477 = svcmla_f32_x(pred_full, v379, v831, v200, 90); + svst1_f64(pred_full, (double *)(v849), svreinterpret_f64_f32(v220)); + svfloat32_t v210 = svsub_f32_x(svptrue_b32(), v209, v161); + svfloat32_t zero428 = svdup_n_f32(0); + svfloat32_t v428 = svcmla_f32_x(pred_full, zero428, v839, v206, 90); + svfloat32_t v452 = svmla_f32_x(pred_full, v451, v175, v814); + svfloat32_t v453 = svmls_f32_x(pred_full, v451, v175, v814); + svfloat32_t v454 = svsub_f32_x(svptrue_b32(), v443, v445); + svfloat32_t v456 = svadd_f32_x(svptrue_b32(), v444, v446); + svfloat32_t v458 = svadd_f32_x(svptrue_b32(), v443, v447); + svfloat32_t v460 = svadd_f32_x(svptrue_b32(), v444, v448); + svfloat32_t v481 = svadd_f32_x(svptrue_b32(), v470, v472); + svfloat32_t v482 = svsub_f32_x(svptrue_b32(), v470, v472); + svfloat32_t v483 = svadd_f32_x(svptrue_b32(), v471, v473); + svfloat32_t v484 = svsub_f32_x(svptrue_b32(), v471, v473); + svfloat32_t v485 = svadd_f32_x(svptrue_b32(), v474, v476); + svfloat32_t v486 = svsub_f32_x(svptrue_b32(), v476, v474); + svfloat32_t v487 = svadd_f32_x(svptrue_b32(), v475, v477); + svfloat32_t v488 = svsub_f32_x(svptrue_b32(), v477, v475); + svfloat32_t v211 = svadd_f32_x(svptrue_b32(), v206, v210); + svfloat32_t zero435 = svdup_n_f32(0); + svfloat32_t v435 = svcmla_f32_x(pred_full, zero435, v840, v210, 90); + svfloat32_t v455 = svadd_f32_x(svptrue_b32(), v449, v452); + svfloat32_t v457 = svadd_f32_x(svptrue_b32(), v450, v453); + svfloat32_t v459 = svsub_f32_x(svptrue_b32(), v452, v449); + svfloat32_t v461 = svsub_f32_x(svptrue_b32(), v453, v450); + svfloat32_t v498 = svadd_f32_x(svptrue_b32(), v483, v487); + svfloat32_t v500 = svadd_f32_x(svptrue_b32(), v482, v488); + svfloat32_t v502 = svsub_f32_x(svptrue_b32(), v481, v485); + svfloat32_t v504 = svsub_f32_x(svptrue_b32(), v488, v482); + svfloat32_t v506 = svadd_f32_x(svptrue_b32(), v481, v485); + svfloat32_t v509 = svsub_f32_x(svptrue_b32(), v486, v484); + svfloat32_t v512 = svsub_f32_x(svptrue_b32(), v487, v483); + svfloat32_t v515 = svadd_f32_x(svptrue_b32(), v484, v486); + svfloat32_t v462 = svadd_f32_x(svptrue_b32(), v454, v455); + svfloat32_t v463 = svadd_f32_x(svptrue_b32(), v456, v457); + svfloat32_t v464 = svadd_f32_x(svptrue_b32(), v458, v459); + svfloat32_t v465 = svadd_f32_x(svptrue_b32(), v460, v461); + svfloat32_t v466 = svsub_f32_x(svptrue_b32(), v455, v454); + svfloat32_t v467 = svsub_f32_x(svptrue_b32(), v457, v456); + svfloat32_t v468 = svsub_f32_x(svptrue_b32(), v459, v458); + svfloat32_t v469 = svsub_f32_x(svptrue_b32(), v461, v460); + svfloat32_t v489 = svsub_f32_x(svptrue_b32(), v428, v435); + svfloat32_t v478 = svcmla_f32_x(pred_full, v435, v841, v211, 90); + svfloat32_t v491 = svadd_f32_x(svptrue_b32(), v489, v489); + svfloat32_t v516 = svsub_f32_x(svptrue_b32(), v515, v489); + svfloat32_t v479 = svcmla_f32_x(pred_full, v478, v833, v202, 90); + svfloat32_t v492 = svsub_f32_x(svptrue_b32(), v407, v491); + svfloat32_t v495 = svadd_f32_x(svptrue_b32(), v478, v478); + svfloat32_t v513 = svadd_f32_x(svptrue_b32(), v512, v491); + svfloat32_t v556 = svadd_f32_x(svptrue_b32(), v469, v516); + svfloat32_t v564 = svsub_f32_x(svptrue_b32(), v469, v516); + svfloat32_t v480 = svcmla_f32_x(pred_full, v479, v834, v35, 90); + svfloat32_t v490 = svcmla_f32_x(pred_full, v479, v835, v107, 90); + svfloat32_t v493 = svcmla_f32_x(pred_full, v492, v837, v89, 90); + svfloat32_t v494 = svcmla_f32_x(pred_full, v492, v838, v161, 90); + svfloat32_t v496 = svadd_f32_x(svptrue_b32(), v495, v495); + svfloat32_t v497 = 
svadd_f32_x(svptrue_b32(), v489, v495); + svfloat32_t v503 = svadd_f32_x(svptrue_b32(), v502, v495); + svfloat32_t v514 = svadd_f32_x(svptrue_b32(), v513, v495); + svst1_f64(pred_full, (double *)(v894), svreinterpret_f64_f32(v556)); + svst1_f64(pred_full, (double *)(v903), svreinterpret_f64_f32(v564)); + svfloat32_t v499 = svadd_f32_x(svptrue_b32(), v498, v490); + svfloat32_t v501 = svadd_f32_x(svptrue_b32(), v500, v493); + svfloat32_t v505 = svsub_f32_x(svptrue_b32(), v504, v497); + svfloat32_t v507 = svadd_f32_x(svptrue_b32(), v506, v480); + svfloat32_t v510 = svsub_f32_x(svptrue_b32(), v509, v494); + svfloat32_t v540 = svadd_f32_x(svptrue_b32(), v464, v503); + svfloat32_t v548 = svsub_f32_x(svptrue_b32(), v464, v503); + svfloat32_t v636 = svadd_f32_x(svptrue_b32(), v468, v514); + svfloat32_t v644 = svsub_f32_x(svptrue_b32(), v468, v514); + svfloat32_t v508 = svadd_f32_x(svptrue_b32(), v507, v489); + svfloat32_t v511 = svadd_f32_x(svptrue_b32(), v510, v496); + svfloat32_t v524 = svadd_f32_x(svptrue_b32(), v462, v499); + svfloat32_t v532 = svsub_f32_x(svptrue_b32(), v462, v499); + svfloat32_t v588 = svadd_f32_x(svptrue_b32(), v465, v505); + svfloat32_t v596 = svsub_f32_x(svptrue_b32(), v465, v505); + svfloat32_t v604 = svadd_f32_x(svptrue_b32(), v463, v501); + svfloat32_t v612 = svsub_f32_x(svptrue_b32(), v463, v501); + svst1_f64(pred_full, (double *)(v876), svreinterpret_f64_f32(v540)); + svst1_f64(pred_full, (double *)(v885), svreinterpret_f64_f32(v548)); + svst1_f64(pred_full, (double *)(v984), svreinterpret_f64_f32(v636)); + svst1_f64(pred_full, (double *)(v993), svreinterpret_f64_f32(v644)); + svfloat32_t v572 = svadd_f32_x(svptrue_b32(), v466, v508); + svfloat32_t v580 = svsub_f32_x(svptrue_b32(), v466, v508); + svfloat32_t v620 = svadd_f32_x(svptrue_b32(), v467, v511); + svfloat32_t v628 = svsub_f32_x(svptrue_b32(), v467, v511); + svst1_f64(pred_full, (double *)(v858), svreinterpret_f64_f32(v524)); + svst1_f64(pred_full, (double *)(v867), svreinterpret_f64_f32(v532)); + svst1_f64(pred_full, (double *)(v930), svreinterpret_f64_f32(v588)); + svst1_f64(pred_full, (double *)(v939), svreinterpret_f64_f32(v596)); + svst1_f64(pred_full, (double *)(v948), svreinterpret_f64_f32(v604)); + svst1_f64(pred_full, (double *)(v957), svreinterpret_f64_f32(v612)); + svst1_f64(pred_full, (double *)(v912), svreinterpret_f64_f32(v572)); + svst1_f64(pred_full, (double *)(v921), svreinterpret_f64_f32(v580)); + svst1_f64(pred_full, (double *)(v966), svreinterpret_f64_f32(v620)); + svst1_f64(pred_full, (double *)(v975), svreinterpret_f64_f32(v628)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cf32_ac_n_gu18(const armral_cmplx_int16_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + int idist, float dir) { + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + for (int j = 0; j < howmany; j += 1) { + int16x4_t v96 = vld1s_s16(&v5[istride]); + float v271 = -5.0000000000000000e-01F; + float v282 = -1.4999999999999998e+00F; + float v285 = 8.6602540378443871e-01F; + float v286 = -8.6602540378443871e-01F; + float v293 = 7.6604444311897801e-01F; + float v297 = 9.3969262078590832e-01F; + float v301 = -1.7364817766693039e-01F; + float v304 = 6.4278760968653925e-01F; + float v305 = -6.4278760968653925e-01F; + float v311 = -3.4202014332566888e-01F; + float v312 = 3.4202014332566888e-01F; + float v318 = 9.8480775301220802e-01F; + float v319 = -9.8480775301220802e-01F; + float32x2_t 
v321 = (float32x2_t){v4, v4}; + int16x4_t v20 = vld1s_s16(&v5[0]); + float32x2_t v97 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v96)), 15); + float32x2_t v272 = (float32x2_t){v271, v271}; + float32x2_t v283 = (float32x2_t){v282, v282}; + float32x2_t v287 = (float32x2_t){v285, v286}; + float32x2_t v294 = (float32x2_t){v293, v293}; + float32x2_t v298 = (float32x2_t){v297, v297}; + float32x2_t v302 = (float32x2_t){v301, v301}; + float32x2_t v306 = (float32x2_t){v304, v305}; + float32x2_t v313 = (float32x2_t){v311, v312}; + float32x2_t v320 = (float32x2_t){v318, v319}; + float32x2_t v21 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v20)), 15); + int16x4_t v26 = vld1s_s16(&v5[istride * 9]); + int16x4_t v34 = vld1s_s16(&v5[istride * 2]); + int16x4_t v40 = vld1s_s16(&v5[istride * 11]); + int16x4_t v48 = vld1s_s16(&v5[istride * 4]); + int16x4_t v54 = vld1s_s16(&v5[istride * 13]); + int16x4_t v62 = vld1s_s16(&v5[istride * 6]); + int16x4_t v68 = vld1s_s16(&v5[istride * 15]); + int16x4_t v76 = vld1s_s16(&v5[istride * 8]); + int16x4_t v82 = vld1s_s16(&v5[istride * 17]); + int16x4_t v90 = vld1s_s16(&v5[istride * 10]); + int16x4_t v104 = vld1s_s16(&v5[istride * 12]); + int16x4_t v110 = vld1s_s16(&v5[istride * 3]); + int16x4_t v118 = vld1s_s16(&v5[istride * 14]); + int16x4_t v124 = vld1s_s16(&v5[istride * 5]); + int16x4_t v132 = vld1s_s16(&v5[istride * 16]); + int16x4_t v138 = vld1s_s16(&v5[istride * 7]); + float32x2_t v289 = vmul_f32(v321, v287); + float32x2_t v308 = vmul_f32(v321, v306); + float32x2_t v315 = vmul_f32(v321, v313); + float32x2_t v322 = vmul_f32(v321, v320); + float32x2_t v27 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v26)), 15); + float32x2_t v35 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v34)), 15); + float32x2_t v41 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v40)), 15); + float32x2_t v49 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v48)), 15); + float32x2_t v55 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v54)), 15); + float32x2_t v63 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v62)), 15); + float32x2_t v69 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v68)), 15); + float32x2_t v77 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v76)), 15); + float32x2_t v83 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v82)), 15); + float32x2_t v91 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v90)), 15); + float32x2_t v105 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v104)), 15); + float32x2_t v111 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v110)), 15); + float32x2_t v119 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v118)), 15); + float32x2_t v125 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v124)), 15); + float32x2_t v133 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v132)), 15); + float32x2_t v139 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v138)), 15); + float32x2_t v28 = vadd_f32(v21, v27); + float32x2_t v29 = vsub_f32(v21, v27); + float32x2_t v42 = vadd_f32(v35, v41); + float32x2_t v43 = vsub_f32(v35, v41); + float32x2_t v56 = vadd_f32(v49, v55); + float32x2_t v57 = vsub_f32(v49, v55); + float32x2_t v70 = vadd_f32(v63, v69); + float32x2_t v71 = vsub_f32(v63, v69); + float32x2_t v84 = vadd_f32(v77, v83); + float32x2_t v85 = vsub_f32(v77, v83); + float32x2_t v98 = vadd_f32(v91, v97); + float32x2_t v99 = vsub_f32(v91, v97); + float32x2_t v112 = vadd_f32(v105, v111); + float32x2_t v113 = vsub_f32(v105, v111); + float32x2_t v126 = vadd_f32(v119, v125); + float32x2_t v127 = vsub_f32(v119, v125); + float32x2_t v140 = vadd_f32(v133, v139); + float32x2_t v141 = vsub_f32(v133, v139); + float32x2_t v142 = vadd_f32(v42, v140); + float32x2_t v143 = vsub_f32(v42, v140); + float32x2_t v144 = 
vadd_f32(v126, v56); + float32x2_t v145 = vsub_f32(v126, v56); + float32x2_t v146 = vadd_f32(v70, v112); + float32x2_t v147 = vsub_f32(v70, v112); + float32x2_t v148 = vadd_f32(v84, v98); + float32x2_t v149 = vsub_f32(v84, v98); + float32x2_t v246 = vadd_f32(v43, v141); + float32x2_t v247 = vsub_f32(v43, v141); + float32x2_t v248 = vadd_f32(v127, v57); + float32x2_t v249 = vsub_f32(v127, v57); + float32x2_t v250 = vadd_f32(v71, v113); + float32x2_t v251 = vsub_f32(v71, v113); + float32x2_t v252 = vadd_f32(v85, v99); + float32x2_t v253 = vsub_f32(v85, v99); + float32x2_t v150 = vadd_f32(v142, v144); + float32x2_t v154 = vadd_f32(v143, v145); + float32x2_t v156 = vsub_f32(v142, v144); + float32x2_t v157 = vsub_f32(v144, v148); + float32x2_t v158 = vsub_f32(v148, v142); + float32x2_t v159 = vsub_f32(v143, v145); + float32x2_t v160 = vsub_f32(v145, v149); + float32x2_t v161 = vsub_f32(v149, v143); + float32x2_t v180 = vmul_f32(v146, v283); + float32x2_t v186 = vrev64_f32(v147); + float32x2_t v254 = vadd_f32(v246, v248); + float32x2_t v258 = vadd_f32(v247, v249); + float32x2_t v260 = vsub_f32(v246, v248); + float32x2_t v261 = vsub_f32(v248, v252); + float32x2_t v262 = vsub_f32(v252, v246); + float32x2_t v263 = vsub_f32(v247, v249); + float32x2_t v264 = vsub_f32(v249, v253); + float32x2_t v265 = vsub_f32(v253, v247); + float32x2_t v284 = vmul_f32(v250, v283); + float32x2_t v290 = vrev64_f32(v251); + float32x2_t v151 = vadd_f32(v150, v148); + float32x2_t v155 = vadd_f32(v154, v149); + float32x2_t v187 = vmul_f32(v186, v289); + float32x2_t v191 = vmul_f32(v156, v294); + float32x2_t v195 = vmul_f32(v157, v298); + float32x2_t v199 = vmul_f32(v158, v302); + float32x2_t v205 = vrev64_f32(v159); + float32x2_t v212 = vrev64_f32(v160); + float32x2_t v219 = vrev64_f32(v161); + float32x2_t v255 = vadd_f32(v254, v252); + float32x2_t v259 = vadd_f32(v258, v253); + float32x2_t v291 = vmul_f32(v290, v289); + float32x2_t v295 = vmul_f32(v260, v294); + float32x2_t v299 = vmul_f32(v261, v298); + float32x2_t v303 = vmul_f32(v262, v302); + float32x2_t v309 = vrev64_f32(v263); + float32x2_t v316 = vrev64_f32(v264); + float32x2_t v323 = vrev64_f32(v265); + float32x2_t v152 = vadd_f32(v151, v146); + float32x2_t v169 = vmul_f32(v151, v272); + float32x2_t v175 = vrev64_f32(v155); + float32x2_t v206 = vmul_f32(v205, v308); + float32x2_t v213 = vmul_f32(v212, v315); + float32x2_t v220 = vmul_f32(v219, v322); + float32x2_t v256 = vadd_f32(v255, v250); + float32x2_t v273 = vmul_f32(v255, v272); + float32x2_t v279 = vrev64_f32(v259); + float32x2_t v310 = vmul_f32(v309, v308); + float32x2_t v317 = vmul_f32(v316, v315); + float32x2_t v324 = vmul_f32(v323, v322); + float32x2_t v153 = vadd_f32(v152, v28); + float32x2_t v176 = vmul_f32(v175, v289); + float32x2_t v221 = vadd_f32(v169, v169); + float32x2_t v234 = vadd_f32(v187, v206); + float32x2_t v236 = vsub_f32(v187, v213); + float32x2_t v238 = vsub_f32(v187, v206); + float32x2_t v257 = vadd_f32(v256, v29); + float32x2_t v280 = vmul_f32(v279, v289); + float32x2_t v325 = vadd_f32(v273, v273); + float32x2_t v338 = vadd_f32(v291, v310); + float32x2_t v340 = vsub_f32(v291, v317); + float32x2_t v342 = vsub_f32(v291, v310); + float32x2_t v222 = vadd_f32(v221, v169); + float32x2_t v226 = vadd_f32(v153, v180); + float32x2_t v235 = vadd_f32(v234, v213); + float32x2_t v237 = vadd_f32(v236, v220); + float32x2_t v239 = vsub_f32(v238, v220); + float32x2_t v326 = vadd_f32(v325, v273); + float32x2_t v330 = vadd_f32(v257, v284); + float32x2_t v339 = vadd_f32(v338, v317); + float32x2_t v341 = 
vadd_f32(v340, v324); + float32x2_t v343 = vsub_f32(v342, v324); + v6[0] = v153; + v6[ostride * 9] = v257; + float32x2_t v223 = vadd_f32(v153, v222); + float32x2_t v227 = vadd_f32(v226, v221); + float32x2_t v327 = vadd_f32(v257, v326); + float32x2_t v331 = vadd_f32(v330, v325); + float32x2_t v224 = vadd_f32(v223, v176); + float32x2_t v225 = vsub_f32(v223, v176); + float32x2_t v228 = vadd_f32(v227, v191); + float32x2_t v230 = vsub_f32(v227, v195); + float32x2_t v232 = vsub_f32(v227, v191); + float32x2_t v328 = vadd_f32(v327, v280); + float32x2_t v329 = vsub_f32(v327, v280); + float32x2_t v332 = vadd_f32(v331, v295); + float32x2_t v334 = vsub_f32(v331, v299); + float32x2_t v336 = vsub_f32(v331, v295); + float32x2_t v229 = vadd_f32(v228, v195); + float32x2_t v231 = vadd_f32(v230, v199); + float32x2_t v233 = vsub_f32(v232, v199); + float32x2_t v333 = vadd_f32(v332, v299); + float32x2_t v335 = vadd_f32(v334, v303); + float32x2_t v337 = vsub_f32(v336, v303); + v6[ostride * 12] = v225; + v6[ostride * 3] = v329; + v6[ostride * 6] = v224; + v6[ostride * 15] = v328; + float32x2_t v240 = vadd_f32(v229, v235); + float32x2_t v241 = vsub_f32(v229, v235); + float32x2_t v242 = vadd_f32(v231, v237); + float32x2_t v243 = vsub_f32(v231, v237); + float32x2_t v244 = vadd_f32(v233, v239); + float32x2_t v245 = vsub_f32(v233, v239); + float32x2_t v344 = vadd_f32(v333, v339); + float32x2_t v345 = vsub_f32(v333, v339); + float32x2_t v346 = vadd_f32(v335, v341); + float32x2_t v347 = vsub_f32(v335, v341); + float32x2_t v348 = vadd_f32(v337, v343); + float32x2_t v349 = vsub_f32(v337, v343); + v6[ostride * 10] = v241; + v6[ostride] = v345; + v6[ostride * 2] = v242; + v6[ostride * 11] = v346; + v6[ostride * 4] = v245; + v6[ostride * 13] = v349; + v6[ostride * 14] = v244; + v6[ostride * 5] = v348; + v6[ostride * 16] = v243; + v6[ostride * 7] = v347; + v6[ostride * 8] = v240; + v6[ostride * 17] = v344; + v5 += 1 * idist; + v6 += 1 * 1; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cf32_ac_n_gu18(const armral_cmplx_int16_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + int idist, float dir) { + int64_t v0 = istride; + int64_t v1 = idist; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * v1; + int64_t v12 = v10 * 1; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + float v316 = -5.0000000000000000e-01F; + float v328 = -1.4999999999999998e+00F; + float v333 = -8.6602540378443871e-01F; + float v340 = 7.6604444311897801e-01F; + float v345 = 9.3969262078590832e-01F; + float v350 = -1.7364817766693039e-01F; + float v355 = -6.4278760968653925e-01F; + float v362 = 3.4202014332566888e-01F; + float v369 = -9.8480775301220802e-01F; + const int32_t *v632 = &v5[v0]; + float32x2_t *v745 = &v6[v2]; + int64_t v27 = v0 * 9; + int64_t v37 = v0 * 2; + int64_t v45 = v0 * 11; + int64_t v55 = v0 * 4; + int64_t v63 = v0 * 13; + int64_t v73 = v0 * 6; + int64_t v81 = v0 * 15; + int64_t v91 = v0 * 8; + int64_t v99 = v0 * 17; + int64_t v109 = v0 * 10; + int64_t v127 = v0 * 12; + int64_t v135 = v0 * 3; + int64_t v145 = v0 * 14; + int64_t v153 = v0 * 5; + int64_t v163 = v0 * 16; + int64_t v171 = v0 * 7; + float v336 = v4 * v333; + float v358 = v4 * v355; + float v365 = v4 * v362; + float v372 = v4 * v369; + int64_t v408 = v2 * 9; + int64_t v415 = v2 * 10; + int64_t v429 = v2 * 2; + int64_t v436 = v2 
* 11; + int64_t v443 = v2 * 12; + int64_t v450 = v2 * 3; + int64_t v457 = v2 * 4; + int64_t v464 = v2 * 13; + int64_t v471 = v2 * 14; + int64_t v478 = v2 * 5; + int64_t v485 = v2 * 6; + int64_t v492 = v2 * 15; + int64_t v499 = v2 * 16; + int64_t v506 = v2 * 7; + int64_t v513 = v2 * 8; + int64_t v520 = v2 * 17; + const int32_t *v533 = &v5[0]; + svint64_t v687 = svindex_s64(0, v1); + svfloat32_t v701 = svdup_n_f32(v316); + svfloat32_t v703 = svdup_n_f32(v328); + svfloat32_t v705 = svdup_n_f32(v340); + svfloat32_t v706 = svdup_n_f32(v345); + svfloat32_t v707 = svdup_n_f32(v350); + float32x2_t *v718 = &v6[0]; + svint16_t v535 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v533), v687)); + const int32_t *v542 = &v5[v27]; + const int32_t *v551 = &v5[v37]; + const int32_t *v560 = &v5[v45]; + const int32_t *v569 = &v5[v55]; + const int32_t *v578 = &v5[v63]; + const int32_t *v587 = &v5[v73]; + const int32_t *v596 = &v5[v81]; + const int32_t *v605 = &v5[v91]; + const int32_t *v614 = &v5[v99]; + const int32_t *v623 = &v5[v109]; + svint16_t v634 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v632), v687)); + const int32_t *v641 = &v5[v127]; + const int32_t *v650 = &v5[v135]; + const int32_t *v659 = &v5[v145]; + const int32_t *v668 = &v5[v153]; + const int32_t *v677 = &v5[v163]; + const int32_t *v686 = &v5[v171]; + svfloat32_t v704 = svdup_n_f32(v336); + svfloat32_t v708 = svdup_n_f32(v358); + svfloat32_t v709 = svdup_n_f32(v365); + svfloat32_t v710 = svdup_n_f32(v372); + float32x2_t *v727 = &v6[v408]; + float32x2_t *v736 = &v6[v415]; + float32x2_t *v754 = &v6[v429]; + float32x2_t *v763 = &v6[v436]; + float32x2_t *v772 = &v6[v443]; + float32x2_t *v781 = &v6[v450]; + float32x2_t *v790 = &v6[v457]; + float32x2_t *v799 = &v6[v464]; + float32x2_t *v808 = &v6[v471]; + float32x2_t *v817 = &v6[v478]; + float32x2_t *v826 = &v6[v485]; + float32x2_t *v835 = &v6[v492]; + float32x2_t *v844 = &v6[v499]; + float32x2_t *v853 = &v6[v506]; + float32x2_t *v862 = &v6[v513]; + float32x2_t *v871 = &v6[v520]; + svfloat32_t v25 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v535, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v123 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v634, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svint16_t v544 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v542), v687)); + svint16_t v553 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v551), v687)); + svint16_t v562 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v560), v687)); + svint16_t v571 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v569), v687)); + svint16_t v580 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v578), v687)); + svint16_t v589 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v587), v687)); + svint16_t v598 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v596), v687)); + svint16_t v607 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v605), v687)); + svint16_t v616 = svreinterpret_s16_u64( + 
svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v614), v687)); + svint16_t v625 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v623), v687)); + svint16_t v643 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v641), v687)); + svint16_t v652 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v650), v687)); + svint16_t v661 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v659), v687)); + svint16_t v670 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v668), v687)); + svint16_t v679 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v677), v687)); + svint16_t v688 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v686), v687)); + svfloat32_t v33 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v544, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v43 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v553, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v51 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v562, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v61 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v571, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v69 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v580, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v79 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v589, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v87 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v598, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v97 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v607, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v105 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v616, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v115 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v625, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v133 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v643, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v141 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, 
svreinterpret_s32_s16(svtbl_s16( + v652, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v151 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v661, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v159 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v670, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v169 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v679, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v177 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v688, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v34 = svadd_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v35 = svsub_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v52 = svadd_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v53 = svsub_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v70 = svadd_f32_x(svptrue_b32(), v61, v69); + svfloat32_t v71 = svsub_f32_x(svptrue_b32(), v61, v69); + svfloat32_t v88 = svadd_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v89 = svsub_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v106 = svadd_f32_x(svptrue_b32(), v97, v105); + svfloat32_t v107 = svsub_f32_x(svptrue_b32(), v97, v105); + svfloat32_t v124 = svadd_f32_x(svptrue_b32(), v115, v123); + svfloat32_t v125 = svsub_f32_x(svptrue_b32(), v115, v123); + svfloat32_t v142 = svadd_f32_x(svptrue_b32(), v133, v141); + svfloat32_t v143 = svsub_f32_x(svptrue_b32(), v133, v141); + svfloat32_t v160 = svadd_f32_x(svptrue_b32(), v151, v159); + svfloat32_t v161 = svsub_f32_x(svptrue_b32(), v151, v159); + svfloat32_t v178 = svadd_f32_x(svptrue_b32(), v169, v177); + svfloat32_t v179 = svsub_f32_x(svptrue_b32(), v169, v177); + svfloat32_t v180 = svadd_f32_x(svptrue_b32(), v52, v178); + svfloat32_t v181 = svsub_f32_x(svptrue_b32(), v52, v178); + svfloat32_t v182 = svadd_f32_x(svptrue_b32(), v160, v70); + svfloat32_t v183 = svsub_f32_x(svptrue_b32(), v160, v70); + svfloat32_t v184 = svadd_f32_x(svptrue_b32(), v88, v142); + svfloat32_t v185 = svsub_f32_x(svptrue_b32(), v88, v142); + svfloat32_t v186 = svadd_f32_x(svptrue_b32(), v106, v124); + svfloat32_t v187 = svsub_f32_x(svptrue_b32(), v106, v124); + svfloat32_t v290 = svadd_f32_x(svptrue_b32(), v53, v179); + svfloat32_t v291 = svsub_f32_x(svptrue_b32(), v53, v179); + svfloat32_t v292 = svadd_f32_x(svptrue_b32(), v161, v71); + svfloat32_t v293 = svsub_f32_x(svptrue_b32(), v161, v71); + svfloat32_t v294 = svadd_f32_x(svptrue_b32(), v89, v143); + svfloat32_t v295 = svsub_f32_x(svptrue_b32(), v89, v143); + svfloat32_t v296 = svadd_f32_x(svptrue_b32(), v107, v125); + svfloat32_t v297 = svsub_f32_x(svptrue_b32(), v107, v125); + svfloat32_t v188 = svadd_f32_x(svptrue_b32(), v180, v182); + svfloat32_t v192 = svadd_f32_x(svptrue_b32(), v181, v183); + svfloat32_t v194 = svsub_f32_x(svptrue_b32(), v180, v182); + svfloat32_t v195 = svsub_f32_x(svptrue_b32(), v182, v186); + svfloat32_t v196 = svsub_f32_x(svptrue_b32(), v186, v180); + svfloat32_t v197 = svsub_f32_x(svptrue_b32(), v181, v183); + svfloat32_t v198 = svsub_f32_x(svptrue_b32(), v183, 
v187); + svfloat32_t v199 = svsub_f32_x(svptrue_b32(), v187, v181); + svfloat32_t zero228 = svdup_n_f32(0); + svfloat32_t v228 = svcmla_f32_x(pred_full, zero228, v704, v185, 90); + svfloat32_t v298 = svadd_f32_x(svptrue_b32(), v290, v292); + svfloat32_t v302 = svadd_f32_x(svptrue_b32(), v291, v293); + svfloat32_t v304 = svsub_f32_x(svptrue_b32(), v290, v292); + svfloat32_t v305 = svsub_f32_x(svptrue_b32(), v292, v296); + svfloat32_t v306 = svsub_f32_x(svptrue_b32(), v296, v290); + svfloat32_t v307 = svsub_f32_x(svptrue_b32(), v291, v293); + svfloat32_t v308 = svsub_f32_x(svptrue_b32(), v293, v297); + svfloat32_t v309 = svsub_f32_x(svptrue_b32(), v297, v291); + svfloat32_t zero338 = svdup_n_f32(0); + svfloat32_t v338 = svcmla_f32_x(pred_full, zero338, v704, v295, 90); + svfloat32_t v189 = svadd_f32_x(svptrue_b32(), v188, v186); + svfloat32_t v193 = svadd_f32_x(svptrue_b32(), v192, v187); + svfloat32_t zero250 = svdup_n_f32(0); + svfloat32_t v250 = svcmla_f32_x(pred_full, zero250, v708, v197, 90); + svfloat32_t zero257 = svdup_n_f32(0); + svfloat32_t v257 = svcmla_f32_x(pred_full, zero257, v709, v198, 90); + svfloat32_t zero264 = svdup_n_f32(0); + svfloat32_t v264 = svcmla_f32_x(pred_full, zero264, v710, v199, 90); + svfloat32_t v299 = svadd_f32_x(svptrue_b32(), v298, v296); + svfloat32_t v303 = svadd_f32_x(svptrue_b32(), v302, v297); + svfloat32_t zero360 = svdup_n_f32(0); + svfloat32_t v360 = svcmla_f32_x(pred_full, zero360, v708, v307, 90); + svfloat32_t zero367 = svdup_n_f32(0); + svfloat32_t v367 = svcmla_f32_x(pred_full, zero367, v709, v308, 90); + svfloat32_t zero374 = svdup_n_f32(0); + svfloat32_t v374 = svcmla_f32_x(pred_full, zero374, v710, v309, 90); + svfloat32_t v190 = svadd_f32_x(svptrue_b32(), v189, v184); + svfloat32_t v209 = svmul_f32_x(svptrue_b32(), v189, v701); + svfloat32_t zero216 = svdup_n_f32(0); + svfloat32_t v216 = svcmla_f32_x(pred_full, zero216, v704, v193, 90); + svfloat32_t v278 = svadd_f32_x(svptrue_b32(), v228, v250); + svfloat32_t v280 = svsub_f32_x(svptrue_b32(), v228, v257); + svfloat32_t v282 = svsub_f32_x(svptrue_b32(), v228, v250); + svfloat32_t v300 = svadd_f32_x(svptrue_b32(), v299, v294); + svfloat32_t v319 = svmul_f32_x(svptrue_b32(), v299, v701); + svfloat32_t zero326 = svdup_n_f32(0); + svfloat32_t v326 = svcmla_f32_x(pred_full, zero326, v704, v303, 90); + svfloat32_t v388 = svadd_f32_x(svptrue_b32(), v338, v360); + svfloat32_t v390 = svsub_f32_x(svptrue_b32(), v338, v367); + svfloat32_t v392 = svsub_f32_x(svptrue_b32(), v338, v360); + svfloat32_t v191 = svadd_f32_x(svptrue_b32(), v190, v34); + svfloat32_t v265 = svadd_f32_x(svptrue_b32(), v209, v209); + svfloat32_t v279 = svadd_f32_x(svptrue_b32(), v278, v257); + svfloat32_t v281 = svadd_f32_x(svptrue_b32(), v280, v264); + svfloat32_t v283 = svsub_f32_x(svptrue_b32(), v282, v264); + svfloat32_t v301 = svadd_f32_x(svptrue_b32(), v300, v35); + svfloat32_t v375 = svadd_f32_x(svptrue_b32(), v319, v319); + svfloat32_t v389 = svadd_f32_x(svptrue_b32(), v388, v367); + svfloat32_t v391 = svadd_f32_x(svptrue_b32(), v390, v374); + svfloat32_t v393 = svsub_f32_x(svptrue_b32(), v392, v374); + svfloat32_t v266 = svmla_f32_x(pred_full, v265, v189, v701); + svfloat32_t v270 = svmla_f32_x(pred_full, v191, v184, v703); + svfloat32_t v376 = svmla_f32_x(pred_full, v375, v299, v701); + svfloat32_t v380 = svmla_f32_x(pred_full, v301, v294, v703); + svst1_f64(pred_full, (double *)(v718), svreinterpret_f64_f32(v191)); + svst1_f64(pred_full, (double *)(v727), svreinterpret_f64_f32(v301)); + svfloat32_t v267 = 
svadd_f32_x(svptrue_b32(), v191, v266); + svfloat32_t v271 = svadd_f32_x(svptrue_b32(), v270, v265); + svfloat32_t v377 = svadd_f32_x(svptrue_b32(), v301, v376); + svfloat32_t v381 = svadd_f32_x(svptrue_b32(), v380, v375); + svfloat32_t v268 = svadd_f32_x(svptrue_b32(), v267, v216); + svfloat32_t v269 = svsub_f32_x(svptrue_b32(), v267, v216); + svfloat32_t v272 = svmla_f32_x(pred_full, v271, v194, v705); + svfloat32_t v274 = svmls_f32_x(pred_full, v271, v195, v706); + svfloat32_t v276 = svmls_f32_x(pred_full, v271, v194, v705); + svfloat32_t v378 = svadd_f32_x(svptrue_b32(), v377, v326); + svfloat32_t v379 = svsub_f32_x(svptrue_b32(), v377, v326); + svfloat32_t v382 = svmla_f32_x(pred_full, v381, v304, v705); + svfloat32_t v384 = svmls_f32_x(pred_full, v381, v305, v706); + svfloat32_t v386 = svmls_f32_x(pred_full, v381, v304, v705); + svfloat32_t v273 = svmla_f32_x(pred_full, v272, v195, v706); + svfloat32_t v275 = svmla_f32_x(pred_full, v274, v196, v707); + svfloat32_t v277 = svmls_f32_x(pred_full, v276, v196, v707); + svfloat32_t v383 = svmla_f32_x(pred_full, v382, v305, v706); + svfloat32_t v385 = svmla_f32_x(pred_full, v384, v306, v707); + svfloat32_t v387 = svmls_f32_x(pred_full, v386, v306, v707); + svst1_f64(pred_full, (double *)(v772), svreinterpret_f64_f32(v269)); + svst1_f64(pred_full, (double *)(v781), svreinterpret_f64_f32(v379)); + svst1_f64(pred_full, (double *)(v826), svreinterpret_f64_f32(v268)); + svst1_f64(pred_full, (double *)(v835), svreinterpret_f64_f32(v378)); + svfloat32_t v284 = svadd_f32_x(svptrue_b32(), v273, v279); + svfloat32_t v285 = svsub_f32_x(svptrue_b32(), v273, v279); + svfloat32_t v286 = svadd_f32_x(svptrue_b32(), v275, v281); + svfloat32_t v287 = svsub_f32_x(svptrue_b32(), v275, v281); + svfloat32_t v288 = svadd_f32_x(svptrue_b32(), v277, v283); + svfloat32_t v289 = svsub_f32_x(svptrue_b32(), v277, v283); + svfloat32_t v394 = svadd_f32_x(svptrue_b32(), v383, v389); + svfloat32_t v395 = svsub_f32_x(svptrue_b32(), v383, v389); + svfloat32_t v396 = svadd_f32_x(svptrue_b32(), v385, v391); + svfloat32_t v397 = svsub_f32_x(svptrue_b32(), v385, v391); + svfloat32_t v398 = svadd_f32_x(svptrue_b32(), v387, v393); + svfloat32_t v399 = svsub_f32_x(svptrue_b32(), v387, v393); + svst1_f64(pred_full, (double *)(v736), svreinterpret_f64_f32(v285)); + svst1_f64(pred_full, (double *)(v745), svreinterpret_f64_f32(v395)); + svst1_f64(pred_full, (double *)(v754), svreinterpret_f64_f32(v286)); + svst1_f64(pred_full, (double *)(v763), svreinterpret_f64_f32(v396)); + svst1_f64(pred_full, (double *)(v790), svreinterpret_f64_f32(v289)); + svst1_f64(pred_full, (double *)(v799), svreinterpret_f64_f32(v399)); + svst1_f64(pred_full, (double *)(v808), svreinterpret_f64_f32(v288)); + svst1_f64(pred_full, (double *)(v817), svreinterpret_f64_f32(v398)); + svst1_f64(pred_full, (double *)(v844), svreinterpret_f64_f32(v287)); + svst1_f64(pred_full, (double *)(v853), svreinterpret_f64_f32(v397)); + svst1_f64(pred_full, (double *)(v862), svreinterpret_f64_f32(v284)); + svst1_f64(pred_full, (double *)(v871), svreinterpret_f64_f32(v394)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cf32_ac_n_gu19(const armral_cmplx_int16_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + int idist, float dir) { + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + for (int j = 0; j < howmany; j += 1) { + int16x4_t v20 = vld1s_s16(&v5[istride]); + float v218 = 
-1.0555555555555556e+00F; + float v222 = 1.7752228513927079e-01F; + float v226 = -1.2820077502191529e-01F; + float v230 = 4.9321510117355499e-02F; + float v234 = 5.7611011491005903e-01F; + float v238 = -7.4996449655536279e-01F; + float v242 = -1.7385438164530381e-01F; + float v246 = -2.1729997561977314e+00F; + float v250 = -1.7021211726914738e+00F; + float v254 = 4.7087858350625778e-01F; + float v258 = -2.0239400846888440e+00F; + float v262 = 1.0551641201664090e-01F; + float v266 = 2.1294564967054850e+00F; + float v270 = -7.5087543897371167e-01F; + float v274 = 1.4812817695157160e-01F; + float v278 = 8.9900361592528333e-01F; + float v282 = -6.2148246772602778e-01F; + float v286 = -7.9869352098712687e-01F; + float v290 = -4.7339199623771833e-01F; + float v293 = -2.4216105241892630e-01F; + float v294 = 2.4216105241892630e-01F; + float v300 = -5.9368607967505101e-02F; + float v301 = 5.9368607967505101e-02F; + float v307 = 1.2578688255176201e-02F; + float v308 = -1.2578688255176201e-02F; + float v314 = -4.6789919712328903e-02F; + float v315 = 4.6789919712328903e-02F; + float v321 = -9.3750121913782358e-01F; + float v322 = 9.3750121913782358e-01F; + float v328 = -5.0111537043352902e-02F; + float v329 = 5.0111537043352902e-02F; + float v335 = -9.8761275618117661e-01F; + float v336 = 9.8761275618117661e-01F; + float v342 = -1.1745786501205959e+00F; + float v343 = 1.1745786501205959e+00F; + float v349 = 1.1114482296234993e+00F; + float v350 = -1.1114482296234993e+00F; + float v356 = 2.2860268797440955e+00F; + float v357 = -2.2860268797440955e+00F; + float v363 = 2.6420523257930939e-01F; + float v364 = -2.6420523257930939e-01F; + float v370 = 2.1981792779352136e+00F; + float v371 = -2.1981792779352136e+00F; + float v377 = 1.9339740453559042e+00F; + float v378 = -1.9339740453559042e+00F; + float v384 = -7.4825847091254893e-01F; + float v385 = 7.4825847091254893e-01F; + float v391 = -4.7820835642768872e-01F; + float v392 = 4.7820835642768872e-01F; + float v398 = 2.7005011448486022e-01F; + float v399 = -2.7005011448486022e-01F; + float v405 = -3.4642356159542270e-01F; + float v406 = 3.4642356159542270e-01F; + float v412 = -8.3485429360688279e-01F; + float v413 = 8.3485429360688279e-01F; + float v419 = -3.9375928506743518e-01F; + float v420 = 3.9375928506743518e-01F; + float32x2_t v422 = (float32x2_t){v4, v4}; + float32x2_t v21 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v20)), 15); + int16x4_t v162 = vld1s_s16(&v5[0]); + float32x2_t v219 = (float32x2_t){v218, v218}; + float32x2_t v223 = (float32x2_t){v222, v222}; + float32x2_t v227 = (float32x2_t){v226, v226}; + float32x2_t v231 = (float32x2_t){v230, v230}; + float32x2_t v235 = (float32x2_t){v234, v234}; + float32x2_t v239 = (float32x2_t){v238, v238}; + float32x2_t v243 = (float32x2_t){v242, v242}; + float32x2_t v247 = (float32x2_t){v246, v246}; + float32x2_t v251 = (float32x2_t){v250, v250}; + float32x2_t v255 = (float32x2_t){v254, v254}; + float32x2_t v259 = (float32x2_t){v258, v258}; + float32x2_t v263 = (float32x2_t){v262, v262}; + float32x2_t v267 = (float32x2_t){v266, v266}; + float32x2_t v271 = (float32x2_t){v270, v270}; + float32x2_t v275 = (float32x2_t){v274, v274}; + float32x2_t v279 = (float32x2_t){v278, v278}; + float32x2_t v283 = (float32x2_t){v282, v282}; + float32x2_t v287 = (float32x2_t){v286, v286}; + float32x2_t v291 = (float32x2_t){v290, v290}; + float32x2_t v295 = (float32x2_t){v293, v294}; + float32x2_t v302 = (float32x2_t){v300, v301}; + float32x2_t v309 = (float32x2_t){v307, v308}; + float32x2_t v316 = (float32x2_t){v314, v315}; + 
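+    /* Descriptive note (added; not part of the generated output): coefficient
+       pairs of the form {c, c} scale the real and imaginary parts of a sample
+       by the same real constant, while pairs of the form {-c, +c}, multiplied
+       by dir and combined with vrev64_f32 further down, implement
+       multiplication by the purely imaginary constant i*dir*c. */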
float32x2_t v323 = (float32x2_t){v321, v322}; + float32x2_t v330 = (float32x2_t){v328, v329}; + float32x2_t v337 = (float32x2_t){v335, v336}; + float32x2_t v344 = (float32x2_t){v342, v343}; + float32x2_t v351 = (float32x2_t){v349, v350}; + float32x2_t v358 = (float32x2_t){v356, v357}; + float32x2_t v365 = (float32x2_t){v363, v364}; + float32x2_t v372 = (float32x2_t){v370, v371}; + float32x2_t v379 = (float32x2_t){v377, v378}; + float32x2_t v386 = (float32x2_t){v384, v385}; + float32x2_t v393 = (float32x2_t){v391, v392}; + float32x2_t v400 = (float32x2_t){v398, v399}; + float32x2_t v407 = (float32x2_t){v405, v406}; + float32x2_t v414 = (float32x2_t){v412, v413}; + float32x2_t v421 = (float32x2_t){v419, v420}; + int16x4_t v26 = vld1s_s16(&v5[istride * 18]); + int16x4_t v34 = vld1s_s16(&v5[istride * 2]); + int16x4_t v40 = vld1s_s16(&v5[istride * 17]); + int16x4_t v48 = vld1s_s16(&v5[istride * 4]); + int16x4_t v54 = vld1s_s16(&v5[istride * 15]); + int16x4_t v62 = vld1s_s16(&v5[istride * 8]); + int16x4_t v68 = vld1s_s16(&v5[istride * 11]); + int16x4_t v76 = vld1s_s16(&v5[istride * 16]); + int16x4_t v82 = vld1s_s16(&v5[istride * 3]); + int16x4_t v90 = vld1s_s16(&v5[istride * 13]); + int16x4_t v96 = vld1s_s16(&v5[istride * 6]); + int16x4_t v104 = vld1s_s16(&v5[istride * 7]); + int16x4_t v110 = vld1s_s16(&v5[istride * 12]); + int16x4_t v118 = vld1s_s16(&v5[istride * 14]); + int16x4_t v124 = vld1s_s16(&v5[istride * 5]); + int16x4_t v132 = vld1s_s16(&v5[istride * 9]); + int16x4_t v138 = vld1s_s16(&v5[istride * 10]); + float32x2_t v163 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v162)), 15); + float32x2_t v297 = vmul_f32(v422, v295); + float32x2_t v304 = vmul_f32(v422, v302); + float32x2_t v311 = vmul_f32(v422, v309); + float32x2_t v318 = vmul_f32(v422, v316); + float32x2_t v325 = vmul_f32(v422, v323); + float32x2_t v332 = vmul_f32(v422, v330); + float32x2_t v339 = vmul_f32(v422, v337); + float32x2_t v346 = vmul_f32(v422, v344); + float32x2_t v353 = vmul_f32(v422, v351); + float32x2_t v360 = vmul_f32(v422, v358); + float32x2_t v367 = vmul_f32(v422, v365); + float32x2_t v374 = vmul_f32(v422, v372); + float32x2_t v381 = vmul_f32(v422, v379); + float32x2_t v388 = vmul_f32(v422, v386); + float32x2_t v395 = vmul_f32(v422, v393); + float32x2_t v402 = vmul_f32(v422, v400); + float32x2_t v409 = vmul_f32(v422, v407); + float32x2_t v416 = vmul_f32(v422, v414); + float32x2_t v423 = vmul_f32(v422, v421); + float32x2_t v27 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v26)), 15); + float32x2_t v35 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v34)), 15); + float32x2_t v41 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v40)), 15); + float32x2_t v49 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v48)), 15); + float32x2_t v55 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v54)), 15); + float32x2_t v63 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v62)), 15); + float32x2_t v69 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v68)), 15); + float32x2_t v77 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v76)), 15); + float32x2_t v83 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v82)), 15); + float32x2_t v91 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v90)), 15); + float32x2_t v97 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v96)), 15); + float32x2_t v105 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v104)), 15); + float32x2_t v111 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v110)), 15); + float32x2_t v119 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v118)), 15); + float32x2_t v125 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v124)), 15); + float32x2_t v133 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v132)), 15); + 
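+    /* Descriptive note (added; not part of the generated output):
+       vcvt_n_f32_s32(vget_low_s32(vmovl_s16(...)), 15) widens each complex
+       Q15 int16 sample to int32 and converts it to float32 with a 2^-15
+       scale, normalising the fixed-point inputs to [-1, 1). */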
float32x2_t v139 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v138)), 15); + float32x2_t v28 = vadd_f32(v21, v27); + float32x2_t v29 = vsub_f32(v21, v27); + float32x2_t v42 = vadd_f32(v35, v41); + float32x2_t v43 = vsub_f32(v41, v35); + float32x2_t v56 = vadd_f32(v49, v55); + float32x2_t v57 = vsub_f32(v49, v55); + float32x2_t v70 = vadd_f32(v63, v69); + float32x2_t v71 = vsub_f32(v69, v63); + float32x2_t v84 = vadd_f32(v77, v83); + float32x2_t v85 = vsub_f32(v77, v83); + float32x2_t v98 = vadd_f32(v91, v97); + float32x2_t v99 = vsub_f32(v97, v91); + float32x2_t v112 = vadd_f32(v105, v111); + float32x2_t v113 = vsub_f32(v105, v111); + float32x2_t v126 = vadd_f32(v119, v125); + float32x2_t v127 = vsub_f32(v125, v119); + float32x2_t v140 = vadd_f32(v133, v139); + float32x2_t v141 = vsub_f32(v133, v139); + float32x2_t v142 = vsub_f32(v28, v112); + float32x2_t v143 = vsub_f32(v42, v126); + float32x2_t v144 = vsub_f32(v56, v140); + float32x2_t v145 = vsub_f32(v70, v112); + float32x2_t v146 = vsub_f32(v84, v126); + float32x2_t v147 = vsub_f32(v98, v140); + float32x2_t v148 = vadd_f32(v28, v70); + float32x2_t v150 = vadd_f32(v42, v84); + float32x2_t v152 = vadd_f32(v56, v98); + float32x2_t v181 = vsub_f32(v29, v113); + float32x2_t v182 = vsub_f32(v43, v127); + float32x2_t v183 = vsub_f32(v57, v141); + float32x2_t v184 = vsub_f32(v71, v113); + float32x2_t v185 = vsub_f32(v85, v127); + float32x2_t v186 = vsub_f32(v99, v141); + float32x2_t v187 = vadd_f32(v29, v71); + float32x2_t v189 = vadd_f32(v43, v85); + float32x2_t v191 = vadd_f32(v57, v99); + float32x2_t v149 = vadd_f32(v148, v112); + float32x2_t v151 = vadd_f32(v150, v126); + float32x2_t v153 = vadd_f32(v152, v140); + float32x2_t v154 = vadd_f32(v142, v144); + float32x2_t v155 = vadd_f32(v145, v147); + float32x2_t v171 = vsub_f32(v142, v145); + float32x2_t v172 = vsub_f32(v144, v147); + float32x2_t v188 = vadd_f32(v187, v113); + float32x2_t v190 = vadd_f32(v189, v127); + float32x2_t v192 = vadd_f32(v191, v141); + float32x2_t v193 = vadd_f32(v181, v183); + float32x2_t v194 = vadd_f32(v184, v186); + float32x2_t v203 = vsub_f32(v181, v184); + float32x2_t v204 = vsub_f32(v183, v186); + float32x2_t v248 = vmul_f32(v145, v247); + float32x2_t v260 = vmul_f32(v147, v259); + float32x2_t v268 = vmul_f32(v144, v267); + float32x2_t v347 = vrev64_f32(v184); + float32x2_t v361 = vrev64_f32(v181); + float32x2_t v368 = vrev64_f32(v186); + float32x2_t v382 = vrev64_f32(v183); + float32x2_t v156 = vadd_f32(v149, v151); + float32x2_t v165 = vadd_f32(v155, v146); + float32x2_t v166 = vadd_f32(v154, v143); + float32x2_t v168 = vsub_f32(v155, v146); + float32x2_t v169 = vsub_f32(v154, v143); + float32x2_t v173 = vsub_f32(v142, v172); + float32x2_t v175 = vadd_f32(v171, v147); + float32x2_t v178 = vsub_f32(v149, v153); + float32x2_t v179 = vsub_f32(v151, v153); + float32x2_t v195 = vadd_f32(v188, v190); + float32x2_t v197 = vadd_f32(v194, v185); + float32x2_t v198 = vadd_f32(v193, v182); + float32x2_t v200 = vsub_f32(v194, v185); + float32x2_t v201 = vsub_f32(v193, v182); + float32x2_t v205 = vsub_f32(v181, v204); + float32x2_t v207 = vadd_f32(v203, v186); + float32x2_t v210 = vsub_f32(v188, v192); + float32x2_t v211 = vsub_f32(v190, v192); + float32x2_t v252 = vmul_f32(v171, v251); + float32x2_t v264 = vmul_f32(v172, v263); + float32x2_t v348 = vmul_f32(v347, v346); + float32x2_t v354 = vrev64_f32(v203); + float32x2_t v369 = vmul_f32(v368, v367); + float32x2_t v375 = vrev64_f32(v204); + float32x2_t v383 = vmul_f32(v382, v381); + float32x2_t v157 = vadd_f32(v156, v153); 
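+    /* Descriptive note (added; not part of the generated output): the
+       vadd_f32/vsub_f32 pairs above form sums and differences of the inputs
+       loaded from index pairs (k, 19 - k); the real and imaginary coefficient
+       groups below act on these symmetric and antisymmetric parts. */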
+ float32x2_t v167 = vsub_f32(v166, v165); + float32x2_t v170 = vsub_f32(v169, v168); + float32x2_t v174 = vsub_f32(v173, v146); + float32x2_t v176 = vsub_f32(v175, v143); + float32x2_t v180 = vadd_f32(v178, v179); + float32x2_t v196 = vadd_f32(v195, v192); + float32x2_t v199 = vsub_f32(v198, v197); + float32x2_t v202 = vsub_f32(v201, v200); + float32x2_t v206 = vsub_f32(v205, v185); + float32x2_t v208 = vsub_f32(v207, v182); + float32x2_t v212 = vadd_f32(v210, v211); + float32x2_t v224 = vmul_f32(v165, v223); + float32x2_t v228 = vmul_f32(v166, v227); + float32x2_t v236 = vmul_f32(v168, v235); + float32x2_t v240 = vmul_f32(v169, v239); + float32x2_t v284 = vmul_f32(v178, v283); + float32x2_t v288 = vmul_f32(v179, v287); + float32x2_t v305 = vrev64_f32(v197); + float32x2_t v312 = vrev64_f32(v198); + float32x2_t v326 = vrev64_f32(v200); + float32x2_t v333 = vrev64_f32(v201); + float32x2_t v355 = vmul_f32(v354, v353); + float32x2_t v376 = vmul_f32(v375, v374); + float32x2_t v410 = vrev64_f32(v210); + float32x2_t v417 = vrev64_f32(v211); + float32x2_t v164 = vadd_f32(v163, v157); + float32x2_t v177 = vsub_f32(v174, v176); + float32x2_t v209 = vsub_f32(v206, v208); + float32x2_t v220 = vmul_f32(v157, v219); + float32x2_t v232 = vmul_f32(v167, v231); + float32x2_t v244 = vmul_f32(v170, v243); + float32x2_t v272 = vmul_f32(v174, v271); + float32x2_t v276 = vmul_f32(v176, v275); + float32x2_t v292 = vmul_f32(v180, v291); + float32x2_t v298 = vrev64_f32(v196); + float32x2_t v306 = vmul_f32(v305, v304); + float32x2_t v313 = vmul_f32(v312, v311); + float32x2_t v319 = vrev64_f32(v199); + float32x2_t v327 = vmul_f32(v326, v325); + float32x2_t v334 = vmul_f32(v333, v332); + float32x2_t v340 = vrev64_f32(v202); + float32x2_t v389 = vrev64_f32(v206); + float32x2_t v396 = vrev64_f32(v208); + float32x2_t v411 = vmul_f32(v410, v409); + float32x2_t v418 = vmul_f32(v417, v416); + float32x2_t v424 = vrev64_f32(v212); + float32x2_t v426 = vadd_f32(v224, v228); + float32x2_t v427 = vadd_f32(v236, v240); + float32x2_t v280 = vmul_f32(v177, v279); + float32x2_t v299 = vmul_f32(v298, v297); + float32x2_t v320 = vmul_f32(v319, v318); + float32x2_t v341 = vmul_f32(v340, v339); + float32x2_t v390 = vmul_f32(v389, v388); + float32x2_t v397 = vmul_f32(v396, v395); + float32x2_t v403 = vrev64_f32(v209); + float32x2_t v425 = vmul_f32(v424, v423); + float32x2_t v429 = vadd_f32(v426, v427); + float32x2_t v430 = vadd_f32(v224, v232); + float32x2_t v431 = vadd_f32(v236, v244); + float32x2_t v448 = vsub_f32(v426, v427); + float32x2_t v450 = vsub_f32(v284, v292); + float32x2_t v451 = vsub_f32(v288, v292); + float32x2_t v452 = vadd_f32(v220, v164); + float32x2_t v457 = vadd_f32(v306, v313); + float32x2_t v458 = vadd_f32(v327, v334); + v6[0] = v164; + float32x2_t v404 = vmul_f32(v403, v402); + float32x2_t v428 = vadd_f32(v276, v280); + float32x2_t v432 = vadd_f32(v272, v280); + float32x2_t v433 = vsub_f32(v248, v429); + float32x2_t v434 = vadd_f32(v430, v431); + float32x2_t v440 = vsub_f32(v430, v431); + float32x2_t v445 = vadd_f32(v429, v268); + float32x2_t v453 = vadd_f32(v452, v450); + float32x2_t v454 = vsub_f32(v452, v450); + float32x2_t v456 = vadd_f32(v452, v451); + float32x2_t v460 = vadd_f32(v457, v458); + float32x2_t v461 = vadd_f32(v306, v320); + float32x2_t v462 = vadd_f32(v327, v341); + float32x2_t v479 = vsub_f32(v457, v458); + float32x2_t v481 = vsub_f32(v411, v425); + float32x2_t v482 = vsub_f32(v418, v425); + float32x2_t v435 = vsub_f32(v260, v432); + float32x2_t v436 = vadd_f32(v252, v428); + float32x2_t v438 = 
vadd_f32(v434, v264); + float32x2_t v441 = vadd_f32(v440, v428); + float32x2_t v442 = vadd_f32(v433, v434); + float32x2_t v449 = vadd_f32(v448, v432); + float32x2_t v455 = vsub_f32(v454, v451); + float32x2_t v459 = vadd_f32(v397, v404); + float32x2_t v463 = vadd_f32(v390, v404); + float32x2_t v464 = vsub_f32(v348, v460); + float32x2_t v465 = vadd_f32(v461, v462); + float32x2_t v471 = vsub_f32(v461, v462); + float32x2_t v476 = vadd_f32(v460, v383); + float32x2_t v483 = vadd_f32(v299, v481); + float32x2_t v484 = vsub_f32(v299, v481); + float32x2_t v486 = vadd_f32(v299, v482); + float32x2_t v437 = vadd_f32(v436, v433); + float32x2_t v439 = vadd_f32(v438, v435); + float32x2_t v443 = vfma_f32(v442, v142, v255); + float32x2_t v446 = vadd_f32(v445, v435); + float32x2_t v466 = vsub_f32(v369, v463); + float32x2_t v467 = vadd_f32(v355, v459); + float32x2_t v469 = vadd_f32(v465, v376); + float32x2_t v472 = vadd_f32(v471, v459); + float32x2_t v473 = vadd_f32(v464, v465); + float32x2_t v480 = vadd_f32(v479, v463); + float32x2_t v485 = vsub_f32(v484, v482); + float32x2_t v491 = vsub_f32(v449, v441); + float32x2_t v495 = vsub_f32(v456, v449); + float32x2_t v498 = vadd_f32(v441, v456); + float32x2_t v444 = vadd_f32(v443, v432); + float32x2_t v447 = vadd_f32(v446, v428); + float32x2_t v468 = vadd_f32(v467, v464); + float32x2_t v470 = vadd_f32(v469, v466); + float32x2_t v474 = vfma_f32(v473, v361, v360); + float32x2_t v477 = vadd_f32(v476, v466); + float32x2_t v492 = vadd_f32(v491, v456); + float32x2_t v496 = vadd_f32(v437, v453); + float32x2_t v497 = vadd_f32(v439, v455); + float32x2_t v503 = vsub_f32(v480, v472); + float32x2_t v507 = vsub_f32(v480, v486); + float32x2_t v510 = vadd_f32(v472, v486); + float32x2_t v475 = vadd_f32(v474, v463); + float32x2_t v478 = vadd_f32(v477, v459); + float32x2_t v487 = vsub_f32(v444, v437); + float32x2_t v489 = vsub_f32(v447, v439); + float32x2_t v493 = vsub_f32(v453, v444); + float32x2_t v494 = vsub_f32(v455, v447); + float32x2_t v504 = vadd_f32(v503, v486); + float32x2_t v508 = vadd_f32(v468, v483); + float32x2_t v509 = vadd_f32(v470, v485); + float32x2_t v528 = vsub_f32(v498, v510); + float32x2_t v534 = vadd_f32(v498, v510); + float32x2_t v540 = vadd_f32(v495, v507); + float32x2_t v546 = vsub_f32(v495, v507); + float32x2_t v488 = vadd_f32(v487, v453); + float32x2_t v490 = vadd_f32(v489, v455); + float32x2_t v499 = vsub_f32(v475, v468); + float32x2_t v501 = vsub_f32(v478, v470); + float32x2_t v505 = vsub_f32(v483, v475); + float32x2_t v506 = vsub_f32(v485, v478); + v6[ostride * 2] = v528; + v6[ostride * 17] = v534; + v6[ostride * 3] = v540; + v6[ostride * 16] = v546; + float32x2_t v552 = vadd_f32(v497, v509); + float32x2_t v558 = vsub_f32(v497, v509); + float32x2_t v564 = vadd_f32(v492, v504); + float32x2_t v570 = vsub_f32(v492, v504); + float32x2_t v600 = vsub_f32(v496, v508); + float32x2_t v606 = vadd_f32(v496, v508); + float32x2_t v500 = vadd_f32(v499, v483); + float32x2_t v502 = vadd_f32(v501, v485); + v6[ostride * 4] = v552; + v6[ostride * 15] = v558; + v6[ostride * 5] = v564; + v6[ostride * 14] = v570; + float32x2_t v576 = vadd_f32(v494, v506); + float32x2_t v582 = vsub_f32(v494, v506); + float32x2_t v588 = vadd_f32(v493, v505); + float32x2_t v594 = vsub_f32(v493, v505); + v6[ostride * 8] = v600; + v6[ostride * 11] = v606; + float32x2_t v516 = vadd_f32(v488, v500); + float32x2_t v522 = vsub_f32(v488, v500); + v6[ostride * 6] = v576; + v6[ostride * 13] = v582; + v6[ostride * 7] = v588; + v6[ostride * 12] = v594; + float32x2_t v612 = vadd_f32(v490, v502); + 
float32x2_t v618 = vsub_f32(v490, v502); + v6[ostride] = v516; + v6[ostride * 18] = v522; + v6[ostride * 9] = v612; + v6[ostride * 10] = v618; + v5 += 1 * idist; + v6 += 1 * 1; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cf32_ac_n_gu19(const armral_cmplx_int16_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + int idist, float dir) { + int64_t v0 = istride; + int64_t v1 = idist; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * v1; + int64_t v12 = v10 * 1; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + float v259 = -1.0555555555555556e+00F; + float v264 = 1.7752228513927079e-01F; + float v269 = -1.2820077502191529e-01F; + float v274 = 4.9321510117355499e-02F; + float v279 = 5.7611011491005903e-01F; + float v284 = -7.4996449655536279e-01F; + float v289 = -1.7385438164530381e-01F; + float v294 = -2.1729997561977314e+00F; + float v299 = -1.7021211726914738e+00F; + float v304 = 4.7087858350625778e-01F; + float v309 = -2.0239400846888440e+00F; + float v314 = 1.0551641201664090e-01F; + float v319 = 2.1294564967054850e+00F; + float v324 = -7.5087543897371167e-01F; + float v329 = 1.4812817695157160e-01F; + float v334 = 8.9900361592528333e-01F; + float v339 = -6.2148246772602778e-01F; + float v344 = -7.9869352098712687e-01F; + float v349 = -4.7339199623771833e-01F; + float v354 = 2.4216105241892630e-01F; + float v361 = 5.9368607967505101e-02F; + float v368 = -1.2578688255176201e-02F; + float v375 = 4.6789919712328903e-02F; + float v382 = 9.3750121913782358e-01F; + float v389 = 5.0111537043352902e-02F; + float v396 = 9.8761275618117661e-01F; + float v403 = 1.1745786501205959e+00F; + float v410 = -1.1114482296234993e+00F; + float v417 = -2.2860268797440955e+00F; + float v424 = -2.6420523257930939e-01F; + float v431 = -2.1981792779352136e+00F; + float v438 = -1.9339740453559042e+00F; + float v445 = 7.4825847091254893e-01F; + float v452 = 4.7820835642768872e-01F; + float v459 = -2.7005011448486022e-01F; + float v466 = 3.4642356159542270e-01F; + float v473 = 8.3485429360688279e-01F; + float v480 = 3.9375928506743518e-01F; + const int32_t *v728 = &v5[v0]; + float32x2_t *v949 = &v6[v2]; + int64_t v27 = v0 * 18; + int64_t v37 = v0 * 2; + int64_t v45 = v0 * 17; + int64_t v55 = v0 * 4; + int64_t v63 = v0 * 15; + int64_t v73 = v0 * 8; + int64_t v81 = v0 * 11; + int64_t v91 = v0 * 16; + int64_t v99 = v0 * 3; + int64_t v109 = v0 * 13; + int64_t v117 = v0 * 6; + int64_t v127 = v0 * 7; + int64_t v135 = v0 * 12; + int64_t v145 = v0 * 14; + int64_t v153 = v0 * 5; + int64_t v163 = v0 * 9; + int64_t v171 = v0 * 10; + float v357 = v4 * v354; + float v364 = v4 * v361; + float v371 = v4 * v368; + float v378 = v4 * v375; + float v385 = v4 * v382; + float v392 = v4 * v389; + float v399 = v4 * v396; + float v406 = v4 * v403; + float v413 = v4 * v410; + float v420 = v4 * v417; + float v427 = v4 * v424; + float v434 = v4 * v431; + float v441 = v4 * v438; + float v448 = v4 * v445; + float v455 = v4 * v452; + float v462 = v4 * v459; + float v469 = v4 * v466; + float v476 = v4 * v473; + float v483 = v4 * v480; + int64_t v588 = v2 * 18; + int64_t v596 = v2 * 2; + int64_t v604 = v2 * 17; + int64_t v612 = v2 * 3; + int64_t v620 = v2 * 16; + int64_t v628 = v2 * 4; + int64_t v636 = v2 * 15; + int64_t v644 = v2 * 5; + int64_t v652 = v2 * 14; + int64_t v660 = v2 * 6; + int64_t 
v668 = v2 * 13; + int64_t v676 = v2 * 7; + int64_t v684 = v2 * 12; + int64_t v692 = v2 * 8; + int64_t v700 = v2 * 11; + int64_t v708 = v2 * 9; + int64_t v716 = v2 * 10; + const int32_t *v891 = &v5[0]; + svint64_t v892 = svindex_s64(0, v1); + svfloat32_t v895 = svdup_n_f32(v259); + svfloat32_t v896 = svdup_n_f32(v264); + svfloat32_t v897 = svdup_n_f32(v269); + svfloat32_t v898 = svdup_n_f32(v274); + svfloat32_t v899 = svdup_n_f32(v279); + svfloat32_t v900 = svdup_n_f32(v284); + svfloat32_t v901 = svdup_n_f32(v289); + svfloat32_t v902 = svdup_n_f32(v294); + svfloat32_t v903 = svdup_n_f32(v299); + svfloat32_t v904 = svdup_n_f32(v304); + svfloat32_t v905 = svdup_n_f32(v309); + svfloat32_t v906 = svdup_n_f32(v314); + svfloat32_t v907 = svdup_n_f32(v319); + svfloat32_t v908 = svdup_n_f32(v324); + svfloat32_t v909 = svdup_n_f32(v329); + svfloat32_t v910 = svdup_n_f32(v334); + svfloat32_t v911 = svdup_n_f32(v339); + svfloat32_t v912 = svdup_n_f32(v344); + svfloat32_t v913 = svdup_n_f32(v349); + float32x2_t *v940 = &v6[0]; + svint16_t v730 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v728), v892)); + const int32_t *v737 = &v5[v27]; + const int32_t *v746 = &v5[v37]; + const int32_t *v755 = &v5[v45]; + const int32_t *v764 = &v5[v55]; + const int32_t *v773 = &v5[v63]; + const int32_t *v782 = &v5[v73]; + const int32_t *v791 = &v5[v81]; + const int32_t *v800 = &v5[v91]; + const int32_t *v809 = &v5[v99]; + const int32_t *v818 = &v5[v109]; + const int32_t *v827 = &v5[v117]; + const int32_t *v836 = &v5[v127]; + const int32_t *v845 = &v5[v135]; + const int32_t *v854 = &v5[v145]; + const int32_t *v863 = &v5[v153]; + const int32_t *v872 = &v5[v163]; + const int32_t *v881 = &v5[v171]; + svint16_t v893 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v891), v892)); + svfloat32_t v914 = svdup_n_f32(v357); + svfloat32_t v915 = svdup_n_f32(v364); + svfloat32_t v916 = svdup_n_f32(v371); + svfloat32_t v917 = svdup_n_f32(v378); + svfloat32_t v918 = svdup_n_f32(v385); + svfloat32_t v919 = svdup_n_f32(v392); + svfloat32_t v920 = svdup_n_f32(v399); + svfloat32_t v921 = svdup_n_f32(v406); + svfloat32_t v922 = svdup_n_f32(v413); + svfloat32_t v923 = svdup_n_f32(v420); + svfloat32_t v924 = svdup_n_f32(v427); + svfloat32_t v925 = svdup_n_f32(v434); + svfloat32_t v926 = svdup_n_f32(v441); + svfloat32_t v927 = svdup_n_f32(v448); + svfloat32_t v928 = svdup_n_f32(v455); + svfloat32_t v929 = svdup_n_f32(v462); + svfloat32_t v930 = svdup_n_f32(v469); + svfloat32_t v931 = svdup_n_f32(v476); + svfloat32_t v932 = svdup_n_f32(v483); + float32x2_t *v958 = &v6[v588]; + float32x2_t *v967 = &v6[v596]; + float32x2_t *v976 = &v6[v604]; + float32x2_t *v985 = &v6[v612]; + float32x2_t *v994 = &v6[v620]; + float32x2_t *v1003 = &v6[v628]; + float32x2_t *v1012 = &v6[v636]; + float32x2_t *v1021 = &v6[v644]; + float32x2_t *v1030 = &v6[v652]; + float32x2_t *v1039 = &v6[v660]; + float32x2_t *v1048 = &v6[v668]; + float32x2_t *v1057 = &v6[v676]; + float32x2_t *v1066 = &v6[v684]; + float32x2_t *v1075 = &v6[v692]; + float32x2_t *v1084 = &v6[v700]; + float32x2_t *v1093 = &v6[v708]; + float32x2_t *v1102 = &v6[v716]; + svfloat32_t v25 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v730, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v203 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v893, 
svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svint16_t v739 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v737), v892)); + svint16_t v748 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v746), v892)); + svint16_t v757 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v755), v892)); + svint16_t v766 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v764), v892)); + svint16_t v775 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v773), v892)); + svint16_t v784 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v782), v892)); + svint16_t v793 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v791), v892)); + svint16_t v802 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v800), v892)); + svint16_t v811 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v809), v892)); + svint16_t v820 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v818), v892)); + svint16_t v829 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v827), v892)); + svint16_t v838 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v836), v892)); + svint16_t v847 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v845), v892)); + svint16_t v856 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v854), v892)); + svint16_t v865 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v863), v892)); + svint16_t v874 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v872), v892)); + svint16_t v883 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v881), v892)); + svfloat32_t v33 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v739, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v43 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v748, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v51 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v757, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v61 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v766, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v69 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v775, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v79 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v784, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v87 = svmul_n_f32_x( + 
pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v793, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v97 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v802, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v105 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v811, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v115 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v820, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v123 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v829, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v133 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v838, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v141 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v847, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v151 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v856, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v159 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v865, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v169 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v874, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v177 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v883, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v34 = svadd_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v35 = svsub_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v52 = svadd_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v53 = svsub_f32_x(svptrue_b32(), v51, v43); + svfloat32_t v70 = svadd_f32_x(svptrue_b32(), v61, v69); + svfloat32_t v71 = svsub_f32_x(svptrue_b32(), v61, v69); + svfloat32_t v88 = svadd_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v89 = svsub_f32_x(svptrue_b32(), v87, v79); + svfloat32_t v106 = svadd_f32_x(svptrue_b32(), v97, v105); + svfloat32_t v107 = svsub_f32_x(svptrue_b32(), v97, v105); + svfloat32_t v124 = svadd_f32_x(svptrue_b32(), v115, v123); + svfloat32_t v125 = svsub_f32_x(svptrue_b32(), v123, v115); + svfloat32_t v142 = svadd_f32_x(svptrue_b32(), v133, v141); + svfloat32_t v143 = svsub_f32_x(svptrue_b32(), v133, v141); + svfloat32_t v160 = svadd_f32_x(svptrue_b32(), v151, v159); + svfloat32_t v161 = svsub_f32_x(svptrue_b32(), v159, v151); + svfloat32_t v178 = svadd_f32_x(svptrue_b32(), v169, 
v177); + svfloat32_t v179 = svsub_f32_x(svptrue_b32(), v169, v177); + svfloat32_t v180 = svsub_f32_x(svptrue_b32(), v34, v142); + svfloat32_t v181 = svsub_f32_x(svptrue_b32(), v52, v160); + svfloat32_t v182 = svsub_f32_x(svptrue_b32(), v70, v178); + svfloat32_t v183 = svsub_f32_x(svptrue_b32(), v88, v142); + svfloat32_t v184 = svsub_f32_x(svptrue_b32(), v106, v160); + svfloat32_t v185 = svsub_f32_x(svptrue_b32(), v124, v178); + svfloat32_t v186 = svadd_f32_x(svptrue_b32(), v34, v88); + svfloat32_t v188 = svadd_f32_x(svptrue_b32(), v52, v106); + svfloat32_t v190 = svadd_f32_x(svptrue_b32(), v70, v124); + svfloat32_t v221 = svsub_f32_x(svptrue_b32(), v35, v143); + svfloat32_t v222 = svsub_f32_x(svptrue_b32(), v53, v161); + svfloat32_t v223 = svsub_f32_x(svptrue_b32(), v71, v179); + svfloat32_t v224 = svsub_f32_x(svptrue_b32(), v89, v143); + svfloat32_t v225 = svsub_f32_x(svptrue_b32(), v107, v161); + svfloat32_t v226 = svsub_f32_x(svptrue_b32(), v125, v179); + svfloat32_t v227 = svadd_f32_x(svptrue_b32(), v35, v89); + svfloat32_t v229 = svadd_f32_x(svptrue_b32(), v53, v107); + svfloat32_t v231 = svadd_f32_x(svptrue_b32(), v71, v125); + svfloat32_t v187 = svadd_f32_x(svptrue_b32(), v186, v142); + svfloat32_t v189 = svadd_f32_x(svptrue_b32(), v188, v160); + svfloat32_t v191 = svadd_f32_x(svptrue_b32(), v190, v178); + svfloat32_t v192 = svadd_f32_x(svptrue_b32(), v180, v182); + svfloat32_t v193 = svadd_f32_x(svptrue_b32(), v183, v185); + svfloat32_t v211 = svsub_f32_x(svptrue_b32(), v180, v183); + svfloat32_t v212 = svsub_f32_x(svptrue_b32(), v182, v185); + svfloat32_t v228 = svadd_f32_x(svptrue_b32(), v227, v143); + svfloat32_t v230 = svadd_f32_x(svptrue_b32(), v229, v161); + svfloat32_t v232 = svadd_f32_x(svptrue_b32(), v231, v179); + svfloat32_t v233 = svadd_f32_x(svptrue_b32(), v221, v223); + svfloat32_t v234 = svadd_f32_x(svptrue_b32(), v224, v226); + svfloat32_t v243 = svsub_f32_x(svptrue_b32(), v221, v224); + svfloat32_t v244 = svsub_f32_x(svptrue_b32(), v223, v226); + svfloat32_t zero408 = svdup_n_f32(0); + svfloat32_t v408 = svcmla_f32_x(pred_full, zero408, v921, v224, 90); + svfloat32_t zero429 = svdup_n_f32(0); + svfloat32_t v429 = svcmla_f32_x(pred_full, zero429, v924, v226, 90); + svfloat32_t v194 = svadd_f32_x(svptrue_b32(), v187, v189); + svfloat32_t v205 = svadd_f32_x(svptrue_b32(), v193, v184); + svfloat32_t v206 = svadd_f32_x(svptrue_b32(), v192, v181); + svfloat32_t v208 = svsub_f32_x(svptrue_b32(), v193, v184); + svfloat32_t v209 = svsub_f32_x(svptrue_b32(), v192, v181); + svfloat32_t v213 = svsub_f32_x(svptrue_b32(), v180, v212); + svfloat32_t v215 = svadd_f32_x(svptrue_b32(), v211, v185); + svfloat32_t v218 = svsub_f32_x(svptrue_b32(), v187, v191); + svfloat32_t v219 = svsub_f32_x(svptrue_b32(), v189, v191); + svfloat32_t v235 = svadd_f32_x(svptrue_b32(), v228, v230); + svfloat32_t v237 = svadd_f32_x(svptrue_b32(), v234, v225); + svfloat32_t v238 = svadd_f32_x(svptrue_b32(), v233, v222); + svfloat32_t v240 = svsub_f32_x(svptrue_b32(), v234, v225); + svfloat32_t v241 = svsub_f32_x(svptrue_b32(), v233, v222); + svfloat32_t v245 = svsub_f32_x(svptrue_b32(), v221, v244); + svfloat32_t v247 = svadd_f32_x(svptrue_b32(), v243, v226); + svfloat32_t v250 = svsub_f32_x(svptrue_b32(), v228, v232); + svfloat32_t v251 = svsub_f32_x(svptrue_b32(), v230, v232); + svfloat32_t v195 = svadd_f32_x(svptrue_b32(), v194, v191); + svfloat32_t v207 = svsub_f32_x(svptrue_b32(), v206, v205); + svfloat32_t v210 = svsub_f32_x(svptrue_b32(), v209, v208); + svfloat32_t v214 = svsub_f32_x(svptrue_b32(), 
v213, v184); + svfloat32_t v216 = svsub_f32_x(svptrue_b32(), v215, v181); + svfloat32_t v220 = svadd_f32_x(svptrue_b32(), v218, v219); + svfloat32_t v236 = svadd_f32_x(svptrue_b32(), v235, v232); + svfloat32_t v239 = svsub_f32_x(svptrue_b32(), v238, v237); + svfloat32_t v242 = svsub_f32_x(svptrue_b32(), v241, v240); + svfloat32_t v246 = svsub_f32_x(svptrue_b32(), v245, v225); + svfloat32_t v248 = svsub_f32_x(svptrue_b32(), v247, v222); + svfloat32_t v252 = svadd_f32_x(svptrue_b32(), v250, v251); + svfloat32_t v272 = svmul_f32_x(svptrue_b32(), v206, v897); + svfloat32_t v287 = svmul_f32_x(svptrue_b32(), v209, v900); + svfloat32_t zero366 = svdup_n_f32(0); + svfloat32_t v366 = svcmla_f32_x(pred_full, zero366, v915, v237, 90); + svfloat32_t zero387 = svdup_n_f32(0); + svfloat32_t v387 = svcmla_f32_x(pred_full, zero387, v918, v240, 90); + svfloat32_t zero471 = svdup_n_f32(0); + svfloat32_t v471 = svcmla_f32_x(pred_full, zero471, v930, v250, 90); + svfloat32_t zero478 = svdup_n_f32(0); + svfloat32_t v478 = svcmla_f32_x(pred_full, zero478, v931, v251, 90); + svfloat32_t v204 = svadd_f32_x(svptrue_b32(), v203, v195); + svfloat32_t v217 = svsub_f32_x(svptrue_b32(), v214, v216); + svfloat32_t v249 = svsub_f32_x(svptrue_b32(), v246, v248); + svfloat32_t v277 = svmul_f32_x(svptrue_b32(), v207, v898); + svfloat32_t v292 = svmul_f32_x(svptrue_b32(), v210, v901); + svfloat32_t v352 = svmul_f32_x(svptrue_b32(), v220, v913); + svfloat32_t zero359 = svdup_n_f32(0); + svfloat32_t v359 = svcmla_f32_x(pred_full, zero359, v914, v236, 90); + svfloat32_t zero485 = svdup_n_f32(0); + svfloat32_t v485 = svcmla_f32_x(pred_full, zero485, v932, v252, 90); + svfloat32_t v486 = svmla_f32_x(pred_full, v272, v205, v896); + svfloat32_t v487 = svmla_f32_x(pred_full, v287, v208, v899); + svfloat32_t v517 = svcmla_f32_x(pred_full, v366, v916, v238, 90); + svfloat32_t v518 = svcmla_f32_x(pred_full, v387, v919, v241, 90); + svfloat32_t v337 = svmul_f32_x(svptrue_b32(), v217, v910); + svfloat32_t zero464 = svdup_n_f32(0); + svfloat32_t v464 = svcmla_f32_x(pred_full, zero464, v929, v249, 90); + svfloat32_t v489 = svadd_f32_x(svptrue_b32(), v486, v487); + svfloat32_t v490 = svmla_f32_x(pred_full, v277, v205, v896); + svfloat32_t v491 = svmla_f32_x(pred_full, v292, v208, v899); + svfloat32_t v508 = svsub_f32_x(svptrue_b32(), v486, v487); + svfloat32_t v510 = svnmls_f32_x(pred_full, v352, v218, v911); + svfloat32_t v511 = svnmls_f32_x(pred_full, v352, v219, v912); + svfloat32_t v512 = svmla_f32_x(pred_full, v204, v195, v895); + svfloat32_t v520 = svadd_f32_x(svptrue_b32(), v517, v518); + svfloat32_t v521 = svcmla_f32_x(pred_full, v366, v917, v239, 90); + svfloat32_t v522 = svcmla_f32_x(pred_full, v387, v920, v242, 90); + svfloat32_t v539 = svsub_f32_x(svptrue_b32(), v517, v518); + svfloat32_t v541 = svsub_f32_x(svptrue_b32(), v471, v485); + svfloat32_t v542 = svsub_f32_x(svptrue_b32(), v478, v485); + svst1_f64(pred_full, (double *)(v940), svreinterpret_f64_f32(v204)); + svfloat32_t v488 = svmla_f32_x(pred_full, v337, v216, v909); + svfloat32_t v492 = svmla_f32_x(pred_full, v337, v214, v908); + svfloat32_t v493 = svnmls_f32_x(pred_full, v489, v183, v902); + svfloat32_t v494 = svadd_f32_x(svptrue_b32(), v490, v491); + svfloat32_t v500 = svsub_f32_x(svptrue_b32(), v490, v491); + svfloat32_t v505 = svmla_f32_x(pred_full, v489, v182, v907); + svfloat32_t v513 = svadd_f32_x(svptrue_b32(), v512, v510); + svfloat32_t v514 = svsub_f32_x(svptrue_b32(), v512, v510); + svfloat32_t v516 = svadd_f32_x(svptrue_b32(), v512, v511); + svfloat32_t 
v519 = svcmla_f32_x(pred_full, v464, v928, v248, 90); + svfloat32_t v523 = svcmla_f32_x(pred_full, v464, v927, v246, 90); + svfloat32_t v524 = svsub_f32_x(svptrue_b32(), v408, v520); + svfloat32_t v525 = svadd_f32_x(svptrue_b32(), v521, v522); + svfloat32_t v531 = svsub_f32_x(svptrue_b32(), v521, v522); + svfloat32_t v536 = svcmla_f32_x(pred_full, v520, v926, v223, 90); + svfloat32_t v543 = svadd_f32_x(svptrue_b32(), v359, v541); + svfloat32_t v544 = svsub_f32_x(svptrue_b32(), v359, v541); + svfloat32_t v546 = svadd_f32_x(svptrue_b32(), v359, v542); + svfloat32_t v495 = svnmls_f32_x(pred_full, v492, v185, v905); + svfloat32_t v496 = svmla_f32_x(pred_full, v488, v211, v903); + svfloat32_t v498 = svmla_f32_x(pred_full, v494, v212, v906); + svfloat32_t v501 = svadd_f32_x(svptrue_b32(), v500, v488); + svfloat32_t v502 = svadd_f32_x(svptrue_b32(), v493, v494); + svfloat32_t v509 = svadd_f32_x(svptrue_b32(), v508, v492); + svfloat32_t v515 = svsub_f32_x(svptrue_b32(), v514, v511); + svfloat32_t v526 = svsub_f32_x(svptrue_b32(), v429, v523); + svfloat32_t v527 = svcmla_f32_x(pred_full, v519, v922, v243, 90); + svfloat32_t v529 = svcmla_f32_x(pred_full, v525, v925, v244, 90); + svfloat32_t v532 = svadd_f32_x(svptrue_b32(), v531, v519); + svfloat32_t v533 = svadd_f32_x(svptrue_b32(), v524, v525); + svfloat32_t v540 = svadd_f32_x(svptrue_b32(), v539, v523); + svfloat32_t v545 = svsub_f32_x(svptrue_b32(), v544, v542); + svfloat32_t v497 = svadd_f32_x(svptrue_b32(), v496, v493); + svfloat32_t v499 = svadd_f32_x(svptrue_b32(), v498, v495); + svfloat32_t v503 = svmla_f32_x(pred_full, v502, v180, v904); + svfloat32_t v506 = svadd_f32_x(svptrue_b32(), v505, v495); + svfloat32_t v528 = svadd_f32_x(svptrue_b32(), v527, v524); + svfloat32_t v530 = svadd_f32_x(svptrue_b32(), v529, v526); + svfloat32_t v534 = svcmla_f32_x(pred_full, v533, v923, v221, 90); + svfloat32_t v537 = svadd_f32_x(svptrue_b32(), v536, v526); + svfloat32_t v551 = svsub_f32_x(svptrue_b32(), v509, v501); + svfloat32_t v555 = svsub_f32_x(svptrue_b32(), v516, v509); + svfloat32_t v558 = svadd_f32_x(svptrue_b32(), v501, v516); + svfloat32_t v563 = svsub_f32_x(svptrue_b32(), v540, v532); + svfloat32_t v567 = svsub_f32_x(svptrue_b32(), v540, v546); + svfloat32_t v570 = svadd_f32_x(svptrue_b32(), v532, v546); + svfloat32_t v504 = svadd_f32_x(svptrue_b32(), v503, v492); + svfloat32_t v507 = svadd_f32_x(svptrue_b32(), v506, v488); + svfloat32_t v535 = svadd_f32_x(svptrue_b32(), v534, v523); + svfloat32_t v538 = svadd_f32_x(svptrue_b32(), v537, v519); + svfloat32_t v552 = svadd_f32_x(svptrue_b32(), v551, v516); + svfloat32_t v556 = svadd_f32_x(svptrue_b32(), v497, v513); + svfloat32_t v557 = svadd_f32_x(svptrue_b32(), v499, v515); + svfloat32_t v564 = svadd_f32_x(svptrue_b32(), v563, v546); + svfloat32_t v568 = svadd_f32_x(svptrue_b32(), v528, v543); + svfloat32_t v569 = svadd_f32_x(svptrue_b32(), v530, v545); + svfloat32_t v594 = svsub_f32_x(svptrue_b32(), v558, v570); + svfloat32_t v602 = svadd_f32_x(svptrue_b32(), v558, v570); + svfloat32_t v610 = svadd_f32_x(svptrue_b32(), v555, v567); + svfloat32_t v618 = svsub_f32_x(svptrue_b32(), v555, v567); + svfloat32_t v547 = svsub_f32_x(svptrue_b32(), v504, v497); + svfloat32_t v549 = svsub_f32_x(svptrue_b32(), v507, v499); + svfloat32_t v553 = svsub_f32_x(svptrue_b32(), v513, v504); + svfloat32_t v554 = svsub_f32_x(svptrue_b32(), v515, v507); + svfloat32_t v559 = svsub_f32_x(svptrue_b32(), v535, v528); + svfloat32_t v561 = svsub_f32_x(svptrue_b32(), v538, v530); + svfloat32_t v565 = 
svsub_f32_x(svptrue_b32(), v543, v535); + svfloat32_t v566 = svsub_f32_x(svptrue_b32(), v545, v538); + svfloat32_t v626 = svadd_f32_x(svptrue_b32(), v557, v569); + svfloat32_t v634 = svsub_f32_x(svptrue_b32(), v557, v569); + svfloat32_t v642 = svadd_f32_x(svptrue_b32(), v552, v564); + svfloat32_t v650 = svsub_f32_x(svptrue_b32(), v552, v564); + svfloat32_t v690 = svsub_f32_x(svptrue_b32(), v556, v568); + svfloat32_t v698 = svadd_f32_x(svptrue_b32(), v556, v568); + svst1_f64(pred_full, (double *)(v967), svreinterpret_f64_f32(v594)); + svst1_f64(pred_full, (double *)(v976), svreinterpret_f64_f32(v602)); + svst1_f64(pred_full, (double *)(v985), svreinterpret_f64_f32(v610)); + svst1_f64(pred_full, (double *)(v994), svreinterpret_f64_f32(v618)); + svfloat32_t v548 = svadd_f32_x(svptrue_b32(), v547, v513); + svfloat32_t v550 = svadd_f32_x(svptrue_b32(), v549, v515); + svfloat32_t v560 = svadd_f32_x(svptrue_b32(), v559, v543); + svfloat32_t v562 = svadd_f32_x(svptrue_b32(), v561, v545); + svfloat32_t v658 = svadd_f32_x(svptrue_b32(), v554, v566); + svfloat32_t v666 = svsub_f32_x(svptrue_b32(), v554, v566); + svfloat32_t v674 = svadd_f32_x(svptrue_b32(), v553, v565); + svfloat32_t v682 = svsub_f32_x(svptrue_b32(), v553, v565); + svst1_f64(pred_full, (double *)(v1003), svreinterpret_f64_f32(v626)); + svst1_f64(pred_full, (double *)(v1012), svreinterpret_f64_f32(v634)); + svst1_f64(pred_full, (double *)(v1021), svreinterpret_f64_f32(v642)); + svst1_f64(pred_full, (double *)(v1030), svreinterpret_f64_f32(v650)); + svst1_f64(pred_full, (double *)(v1075), svreinterpret_f64_f32(v690)); + svst1_f64(pred_full, (double *)(v1084), svreinterpret_f64_f32(v698)); + svfloat32_t v578 = svadd_f32_x(svptrue_b32(), v548, v560); + svfloat32_t v586 = svsub_f32_x(svptrue_b32(), v548, v560); + svfloat32_t v706 = svadd_f32_x(svptrue_b32(), v550, v562); + svfloat32_t v714 = svsub_f32_x(svptrue_b32(), v550, v562); + svst1_f64(pred_full, (double *)(v1039), svreinterpret_f64_f32(v658)); + svst1_f64(pred_full, (double *)(v1048), svreinterpret_f64_f32(v666)); + svst1_f64(pred_full, (double *)(v1057), svreinterpret_f64_f32(v674)); + svst1_f64(pred_full, (double *)(v1066), svreinterpret_f64_f32(v682)); + svst1_f64(pred_full, (double *)(v949), svreinterpret_f64_f32(v578)); + svst1_f64(pred_full, (double *)(v958), svreinterpret_f64_f32(v586)); + svst1_f64(pred_full, (double *)(v1093), svreinterpret_f64_f32(v706)); + svst1_f64(pred_full, (double *)(v1102), svreinterpret_f64_f32(v714)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cf32_ac_n_gu20(const armral_cmplx_int16_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + int idist, float dir) { + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + for (int j = 0; j < howmany; j += 1) { + int16x4_t v154 = vld1s_s16(&v5[istride]); + float v286 = 1.5388417685876268e+00F; + float v293 = 5.8778525229247325e-01F; + float v300 = 3.6327126400268028e-01F; + float v324 = 1.0000000000000000e+00F; + float v325 = -1.0000000000000000e+00F; + float v331 = -1.2500000000000000e+00F; + float v332 = 1.2500000000000000e+00F; + float v338 = 5.5901699437494745e-01F; + float v339 = -5.5901699437494745e-01F; + float32x2_t v341 = (float32x2_t){v4, v4}; + float v346 = -1.5388417685876268e+00F; + float v350 = -5.8778525229247325e-01F; + float v354 = -3.6327126400268028e-01F; + int16x4_t v20 = vld1s_s16(&v5[0]); + float32x2_t v155 = 
vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v154)), 15); + float32x2_t v280 = (float32x2_t){v331, v331}; + float32x2_t v284 = (float32x2_t){v338, v338}; + float32x2_t v288 = (float32x2_t){v286, v346}; + float32x2_t v295 = (float32x2_t){v293, v350}; + float32x2_t v302 = (float32x2_t){v300, v354}; + float32x2_t v326 = (float32x2_t){v324, v325}; + float32x2_t v333 = (float32x2_t){v331, v332}; + float32x2_t v340 = (float32x2_t){v338, v339}; + float32x2_t v347 = (float32x2_t){v346, v346}; + float32x2_t v351 = (float32x2_t){v350, v350}; + float32x2_t v355 = (float32x2_t){v354, v354}; + float32x2_t v21 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v20)), 15); + int16x4_t v26 = vld1s_s16(&v5[istride * 10]); + int16x4_t v34 = vld1s_s16(&v5[istride * 5]); + int16x4_t v40 = vld1s_s16(&v5[istride * 15]); + int16x4_t v50 = vld1s_s16(&v5[istride * 4]); + int16x4_t v56 = vld1s_s16(&v5[istride * 14]); + int16x4_t v64 = vld1s_s16(&v5[istride * 9]); + int16x4_t v70 = vld1s_s16(&v5[istride * 19]); + int16x4_t v80 = vld1s_s16(&v5[istride * 8]); + int16x4_t v86 = vld1s_s16(&v5[istride * 18]); + int16x4_t v94 = vld1s_s16(&v5[istride * 13]); + int16x4_t v100 = vld1s_s16(&v5[istride * 3]); + int16x4_t v110 = vld1s_s16(&v5[istride * 12]); + int16x4_t v116 = vld1s_s16(&v5[istride * 2]); + int16x4_t v124 = vld1s_s16(&v5[istride * 17]); + int16x4_t v130 = vld1s_s16(&v5[istride * 7]); + int16x4_t v140 = vld1s_s16(&v5[istride * 16]); + int16x4_t v146 = vld1s_s16(&v5[istride * 6]); + int16x4_t v160 = vld1s_s16(&v5[istride * 11]); + float32x2_t v290 = vmul_f32(v341, v288); + float32x2_t v297 = vmul_f32(v341, v295); + float32x2_t v304 = vmul_f32(v341, v302); + float32x2_t v328 = vmul_f32(v341, v326); + float32x2_t v335 = vmul_f32(v341, v333); + float32x2_t v342 = vmul_f32(v341, v340); + float32x2_t v27 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v26)), 15); + float32x2_t v35 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v34)), 15); + float32x2_t v41 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v40)), 15); + float32x2_t v51 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v50)), 15); + float32x2_t v57 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v56)), 15); + float32x2_t v65 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v64)), 15); + float32x2_t v71 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v70)), 15); + float32x2_t v81 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v80)), 15); + float32x2_t v87 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v86)), 15); + float32x2_t v95 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v94)), 15); + float32x2_t v101 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v100)), 15); + float32x2_t v111 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v110)), 15); + float32x2_t v117 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v116)), 15); + float32x2_t v125 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v124)), 15); + float32x2_t v131 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v130)), 15); + float32x2_t v141 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v140)), 15); + float32x2_t v147 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v146)), 15); + float32x2_t v161 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v160)), 15); + float32x2_t v28 = vadd_f32(v21, v27); + float32x2_t v29 = vsub_f32(v21, v27); + float32x2_t v42 = vadd_f32(v35, v41); + float32x2_t v43 = vsub_f32(v35, v41); + float32x2_t v58 = vadd_f32(v51, v57); + float32x2_t v59 = vsub_f32(v51, v57); + float32x2_t v72 = vadd_f32(v65, v71); + float32x2_t v73 = vsub_f32(v65, v71); + float32x2_t v88 = vadd_f32(v81, v87); + float32x2_t v89 = vsub_f32(v81, v87); + float32x2_t v102 = vadd_f32(v95, v101); + float32x2_t v103 = vsub_f32(v95, v101); + float32x2_t v118 = 
vadd_f32(v111, v117); + float32x2_t v119 = vsub_f32(v111, v117); + float32x2_t v132 = vadd_f32(v125, v131); + float32x2_t v133 = vsub_f32(v125, v131); + float32x2_t v148 = vadd_f32(v141, v147); + float32x2_t v149 = vsub_f32(v141, v147); + float32x2_t v162 = vadd_f32(v155, v161); + float32x2_t v163 = vsub_f32(v155, v161); + float32x2_t v44 = vadd_f32(v28, v42); + float32x2_t v45 = vsub_f32(v28, v42); + float32x2_t v74 = vadd_f32(v58, v72); + float32x2_t v75 = vsub_f32(v58, v72); + float32x2_t v104 = vadd_f32(v88, v102); + float32x2_t v105 = vsub_f32(v88, v102); + float32x2_t v134 = vadd_f32(v118, v132); + float32x2_t v135 = vsub_f32(v118, v132); + float32x2_t v164 = vadd_f32(v148, v162); + float32x2_t v165 = vsub_f32(v148, v162); + float32x2_t v266 = vadd_f32(v59, v149); + float32x2_t v267 = vsub_f32(v59, v149); + float32x2_t v268 = vadd_f32(v119, v89); + float32x2_t v269 = vsub_f32(v119, v89); + float32x2_t v316 = vadd_f32(v73, v163); + float32x2_t v317 = vsub_f32(v73, v163); + float32x2_t v318 = vadd_f32(v133, v103); + float32x2_t v319 = vsub_f32(v133, v103); + float32x2_t v166 = vadd_f32(v74, v164); + float32x2_t v167 = vsub_f32(v74, v164); + float32x2_t v168 = vadd_f32(v134, v104); + float32x2_t v169 = vsub_f32(v134, v104); + float32x2_t v216 = vadd_f32(v75, v165); + float32x2_t v217 = vsub_f32(v75, v165); + float32x2_t v218 = vadd_f32(v135, v105); + float32x2_t v219 = vsub_f32(v135, v105); + float32x2_t v270 = vadd_f32(v266, v268); + float32x2_t v271 = vsub_f32(v266, v268); + float32x2_t v272 = vadd_f32(v267, v269); + float32x2_t v291 = vrev64_f32(v267); + float32x2_t v305 = vrev64_f32(v269); + float32x2_t v320 = vadd_f32(v316, v318); + float32x2_t v321 = vsub_f32(v316, v318); + float32x2_t v322 = vadd_f32(v317, v319); + float32x2_t v348 = vmul_f32(v317, v347); + float32x2_t v356 = vmul_f32(v319, v355); + float32x2_t v170 = vadd_f32(v166, v168); + float32x2_t v171 = vsub_f32(v166, v168); + float32x2_t v172 = vadd_f32(v167, v169); + float32x2_t v191 = vrev64_f32(v167); + float32x2_t v205 = vrev64_f32(v169); + float32x2_t v220 = vadd_f32(v216, v218); + float32x2_t v221 = vsub_f32(v216, v218); + float32x2_t v222 = vadd_f32(v217, v219); + float32x2_t v241 = vrev64_f32(v217); + float32x2_t v255 = vrev64_f32(v219); + float32x2_t v273 = vadd_f32(v270, v29); + float32x2_t v281 = vmul_f32(v270, v280); + float32x2_t v285 = vmul_f32(v271, v284); + float32x2_t v292 = vmul_f32(v291, v290); + float32x2_t v298 = vrev64_f32(v272); + float32x2_t v306 = vmul_f32(v305, v304); + float32x2_t v323 = vadd_f32(v320, v43); + float32x2_t v336 = vrev64_f32(v320); + float32x2_t v343 = vrev64_f32(v321); + float32x2_t v352 = vmul_f32(v322, v351); + float32x2_t v173 = vadd_f32(v170, v44); + float32x2_t v181 = vmul_f32(v170, v280); + float32x2_t v185 = vmul_f32(v171, v284); + float32x2_t v192 = vmul_f32(v191, v290); + float32x2_t v198 = vrev64_f32(v172); + float32x2_t v206 = vmul_f32(v205, v304); + float32x2_t v223 = vadd_f32(v220, v45); + float32x2_t v231 = vmul_f32(v220, v280); + float32x2_t v235 = vmul_f32(v221, v284); + float32x2_t v242 = vmul_f32(v241, v290); + float32x2_t v248 = vrev64_f32(v222); + float32x2_t v256 = vmul_f32(v255, v304); + float32x2_t v299 = vmul_f32(v298, v297); + float32x2_t v307 = vadd_f32(v273, v281); + float32x2_t v329 = vrev64_f32(v323); + float32x2_t v337 = vmul_f32(v336, v335); + float32x2_t v344 = vmul_f32(v343, v342); + float32x2_t v360 = vsub_f32(v348, v352); + float32x2_t v361 = vadd_f32(v352, v356); + float32x2_t v199 = vmul_f32(v198, v297); + float32x2_t v207 = vadd_f32(v173, 
v181); + float32x2_t v249 = vmul_f32(v248, v297); + float32x2_t v257 = vadd_f32(v223, v231); + float32x2_t v308 = vadd_f32(v307, v285); + float32x2_t v309 = vsub_f32(v307, v285); + float32x2_t v310 = vsub_f32(v292, v299); + float32x2_t v311 = vadd_f32(v299, v306); + float32x2_t v330 = vmul_f32(v329, v328); + v6[0] = v173; + v6[ostride * 10] = v223; + float32x2_t v208 = vadd_f32(v207, v185); + float32x2_t v209 = vsub_f32(v207, v185); + float32x2_t v210 = vsub_f32(v192, v199); + float32x2_t v211 = vadd_f32(v199, v206); + float32x2_t v258 = vadd_f32(v257, v235); + float32x2_t v259 = vsub_f32(v257, v235); + float32x2_t v260 = vsub_f32(v242, v249); + float32x2_t v261 = vadd_f32(v249, v256); + float32x2_t v312 = vadd_f32(v308, v310); + float32x2_t v313 = vsub_f32(v308, v310); + float32x2_t v314 = vadd_f32(v309, v311); + float32x2_t v315 = vsub_f32(v309, v311); + float32x2_t v357 = vadd_f32(v330, v337); + float32x2_t v366 = vadd_f32(v273, v330); + float32x2_t v367 = vsub_f32(v273, v330); + float32x2_t v212 = vadd_f32(v208, v210); + float32x2_t v213 = vsub_f32(v208, v210); + float32x2_t v214 = vadd_f32(v209, v211); + float32x2_t v215 = vsub_f32(v209, v211); + float32x2_t v262 = vadd_f32(v258, v260); + float32x2_t v263 = vsub_f32(v258, v260); + float32x2_t v264 = vadd_f32(v259, v261); + float32x2_t v265 = vsub_f32(v259, v261); + float32x2_t v358 = vadd_f32(v357, v344); + float32x2_t v359 = vsub_f32(v357, v344); + v6[ostride * 5] = v367; + v6[ostride * 15] = v366; + float32x2_t v362 = vadd_f32(v358, v360); + float32x2_t v363 = vsub_f32(v358, v360); + float32x2_t v364 = vadd_f32(v359, v361); + float32x2_t v365 = vsub_f32(v359, v361); + v6[ostride * 16] = v213; + v6[ostride * 6] = v263; + v6[ostride * 12] = v215; + v6[ostride * 2] = v265; + v6[ostride * 8] = v214; + v6[ostride * 18] = v264; + v6[ostride * 4] = v212; + v6[ostride * 14] = v262; + float32x2_t v388 = vadd_f32(v313, v363); + float32x2_t v389 = vsub_f32(v313, v363); + float32x2_t v410 = vadd_f32(v315, v365); + float32x2_t v411 = vsub_f32(v315, v365); + float32x2_t v432 = vadd_f32(v314, v364); + float32x2_t v433 = vsub_f32(v314, v364); + float32x2_t v454 = vadd_f32(v312, v362); + float32x2_t v455 = vsub_f32(v312, v362); + v6[ostride] = v389; + v6[ostride * 11] = v388; + v6[ostride * 17] = v411; + v6[ostride * 7] = v410; + v6[ostride * 13] = v433; + v6[ostride * 3] = v432; + v6[ostride * 9] = v455; + v6[ostride * 19] = v454; + v5 += 1 * idist; + v6 += 1 * 1; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cf32_ac_n_gu20(const armral_cmplx_int16_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + int idist, float dir) { + int64_t v0 = istride; + int64_t v1 = idist; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * v1; + int64_t v12 = v10 * 1; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + float v328 = -1.2500000000000000e+00F; + float v333 = 5.5901699437494745e-01F; + float v376 = -1.0000000000000000e+00F; + float v383 = 1.2500000000000000e+00F; + float v390 = -5.5901699437494745e-01F; + float v397 = -1.5388417685876268e+00F; + float v402 = -5.8778525229247325e-01F; + float v407 = -3.6327126400268028e-01F; + const int32_t *v739 = &v5[v0]; + float32x2_t *v827 = &v6[v2]; + int64_t v27 = v0 * 10; + int64_t v37 = v0 * 5; + int64_t v45 = v0 * 15; + int64_t v57 = v0 * 4; + int64_t v65 = v0 * 14; + 
int64_t v75 = v0 * 9; + int64_t v83 = v0 * 19; + int64_t v95 = v0 * 8; + int64_t v103 = v0 * 18; + int64_t v113 = v0 * 13; + int64_t v121 = v0 * 3; + int64_t v133 = v0 * 12; + int64_t v141 = v0 * 2; + int64_t v151 = v0 * 17; + int64_t v159 = v0 * 7; + int64_t v171 = v0 * 16; + int64_t v179 = v0 * 6; + int64_t v197 = v0 * 11; + float v341 = v4 * v397; + float v348 = v4 * v402; + float v355 = v4 * v407; + float v379 = v4 * v376; + float v386 = v4 * v383; + float v393 = v4 * v390; + int64_t v430 = v2 * 5; + int64_t v437 = v2 * 10; + int64_t v444 = v2 * 15; + int64_t v453 = v2 * 16; + int64_t v467 = v2 * 6; + int64_t v474 = v2 * 11; + int64_t v483 = v2 * 12; + int64_t v490 = v2 * 17; + int64_t v497 = v2 * 2; + int64_t v504 = v2 * 7; + int64_t v513 = v2 * 8; + int64_t v520 = v2 * 13; + int64_t v527 = v2 * 18; + int64_t v534 = v2 * 3; + int64_t v543 = v2 * 4; + int64_t v550 = v2 * 9; + int64_t v557 = v2 * 14; + int64_t v564 = v2 * 19; + const int32_t *v577 = &v5[0]; + svint64_t v749 = svindex_s64(0, v1); + svfloat32_t v764 = svdup_n_f32(v328); + svfloat32_t v765 = svdup_n_f32(v333); + svfloat32_t v772 = svdup_n_f32(v397); + svfloat32_t v773 = svdup_n_f32(v402); + svfloat32_t v774 = svdup_n_f32(v407); + float32x2_t *v782 = &v6[0]; + svint16_t v579 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v577), v749)); + const int32_t *v586 = &v5[v27]; + const int32_t *v595 = &v5[v37]; + const int32_t *v604 = &v5[v45]; + const int32_t *v613 = &v5[v57]; + const int32_t *v622 = &v5[v65]; + const int32_t *v631 = &v5[v75]; + const int32_t *v640 = &v5[v83]; + const int32_t *v649 = &v5[v95]; + const int32_t *v658 = &v5[v103]; + const int32_t *v667 = &v5[v113]; + const int32_t *v676 = &v5[v121]; + const int32_t *v685 = &v5[v133]; + const int32_t *v694 = &v5[v141]; + const int32_t *v703 = &v5[v151]; + const int32_t *v712 = &v5[v159]; + const int32_t *v721 = &v5[v171]; + const int32_t *v730 = &v5[v179]; + svint16_t v741 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v739), v749)); + const int32_t *v748 = &v5[v197]; + svfloat32_t v766 = svdup_n_f32(v341); + svfloat32_t v767 = svdup_n_f32(v348); + svfloat32_t v768 = svdup_n_f32(v355); + svfloat32_t v769 = svdup_n_f32(v379); + svfloat32_t v770 = svdup_n_f32(v386); + svfloat32_t v771 = svdup_n_f32(v393); + float32x2_t *v791 = &v6[v430]; + float32x2_t *v800 = &v6[v437]; + float32x2_t *v809 = &v6[v444]; + float32x2_t *v818 = &v6[v453]; + float32x2_t *v836 = &v6[v467]; + float32x2_t *v845 = &v6[v474]; + float32x2_t *v854 = &v6[v483]; + float32x2_t *v863 = &v6[v490]; + float32x2_t *v872 = &v6[v497]; + float32x2_t *v881 = &v6[v504]; + float32x2_t *v890 = &v6[v513]; + float32x2_t *v899 = &v6[v520]; + float32x2_t *v908 = &v6[v527]; + float32x2_t *v917 = &v6[v534]; + float32x2_t *v926 = &v6[v543]; + float32x2_t *v935 = &v6[v550]; + float32x2_t *v944 = &v6[v557]; + float32x2_t *v953 = &v6[v564]; + svfloat32_t v25 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v579, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v195 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v741, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svint16_t v588 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v586), v749)); + svint16_t v597 = 
svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v595), v749)); + svint16_t v606 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v604), v749)); + svint16_t v615 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v613), v749)); + svint16_t v624 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v622), v749)); + svint16_t v633 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v631), v749)); + svint16_t v642 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v640), v749)); + svint16_t v651 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v649), v749)); + svint16_t v660 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v658), v749)); + svint16_t v669 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v667), v749)); + svint16_t v678 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v676), v749)); + svint16_t v687 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v685), v749)); + svint16_t v696 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v694), v749)); + svint16_t v705 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v703), v749)); + svint16_t v714 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v712), v749)); + svint16_t v723 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v721), v749)); + svint16_t v732 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v730), v749)); + svint16_t v750 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v748), v749)); + svfloat32_t v33 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v588, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v43 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v597, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v51 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v606, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v63 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v615, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v71 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v624, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v81 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v633, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v89 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v642, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 
0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v101 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v651, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v109 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v660, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v119 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v669, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v127 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v678, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v139 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v687, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v147 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v696, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v157 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v705, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v165 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v714, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v177 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v723, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v185 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v732, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v203 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v750, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v34 = svadd_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v35 = svsub_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v52 = svadd_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v53 = svsub_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v72 = svadd_f32_x(svptrue_b32(), v63, v71); + svfloat32_t v73 = svsub_f32_x(svptrue_b32(), v63, v71); + svfloat32_t v90 = svadd_f32_x(svptrue_b32(), v81, v89); + svfloat32_t v91 = svsub_f32_x(svptrue_b32(), v81, v89); + svfloat32_t v110 = svadd_f32_x(svptrue_b32(), v101, v109); + svfloat32_t v111 = svsub_f32_x(svptrue_b32(), v101, v109); + svfloat32_t v128 = svadd_f32_x(svptrue_b32(), v119, v127); + svfloat32_t v129 = svsub_f32_x(svptrue_b32(), v119, v127); + svfloat32_t v148 = svadd_f32_x(svptrue_b32(), v139, v147); + svfloat32_t v149 = svsub_f32_x(svptrue_b32(), v139, v147); + svfloat32_t v166 = svadd_f32_x(svptrue_b32(), v157, v165); + svfloat32_t v167 = 
svsub_f32_x(svptrue_b32(), v157, v165); + svfloat32_t v186 = svadd_f32_x(svptrue_b32(), v177, v185); + svfloat32_t v187 = svsub_f32_x(svptrue_b32(), v177, v185); + svfloat32_t v204 = svadd_f32_x(svptrue_b32(), v195, v203); + svfloat32_t v205 = svsub_f32_x(svptrue_b32(), v195, v203); + svfloat32_t v54 = svadd_f32_x(svptrue_b32(), v34, v52); + svfloat32_t v55 = svsub_f32_x(svptrue_b32(), v34, v52); + svfloat32_t v92 = svadd_f32_x(svptrue_b32(), v72, v90); + svfloat32_t v93 = svsub_f32_x(svptrue_b32(), v72, v90); + svfloat32_t v130 = svadd_f32_x(svptrue_b32(), v110, v128); + svfloat32_t v131 = svsub_f32_x(svptrue_b32(), v110, v128); + svfloat32_t v168 = svadd_f32_x(svptrue_b32(), v148, v166); + svfloat32_t v169 = svsub_f32_x(svptrue_b32(), v148, v166); + svfloat32_t v206 = svadd_f32_x(svptrue_b32(), v186, v204); + svfloat32_t v207 = svsub_f32_x(svptrue_b32(), v186, v204); + svfloat32_t v314 = svadd_f32_x(svptrue_b32(), v73, v187); + svfloat32_t v315 = svsub_f32_x(svptrue_b32(), v73, v187); + svfloat32_t v316 = svadd_f32_x(svptrue_b32(), v149, v111); + svfloat32_t v317 = svsub_f32_x(svptrue_b32(), v149, v111); + svfloat32_t v367 = svadd_f32_x(svptrue_b32(), v91, v205); + svfloat32_t v368 = svsub_f32_x(svptrue_b32(), v91, v205); + svfloat32_t v369 = svadd_f32_x(svptrue_b32(), v167, v129); + svfloat32_t v370 = svsub_f32_x(svptrue_b32(), v167, v129); + svfloat32_t v208 = svadd_f32_x(svptrue_b32(), v92, v206); + svfloat32_t v209 = svsub_f32_x(svptrue_b32(), v92, v206); + svfloat32_t v210 = svadd_f32_x(svptrue_b32(), v168, v130); + svfloat32_t v211 = svsub_f32_x(svptrue_b32(), v168, v130); + svfloat32_t v261 = svadd_f32_x(svptrue_b32(), v93, v207); + svfloat32_t v262 = svsub_f32_x(svptrue_b32(), v93, v207); + svfloat32_t v263 = svadd_f32_x(svptrue_b32(), v169, v131); + svfloat32_t v264 = svsub_f32_x(svptrue_b32(), v169, v131); + svfloat32_t v318 = svadd_f32_x(svptrue_b32(), v314, v316); + svfloat32_t v319 = svsub_f32_x(svptrue_b32(), v314, v316); + svfloat32_t v320 = svadd_f32_x(svptrue_b32(), v315, v317); + svfloat32_t zero343 = svdup_n_f32(0); + svfloat32_t v343 = svcmla_f32_x(pred_full, zero343, v766, v315, 90); + svfloat32_t v371 = svadd_f32_x(svptrue_b32(), v367, v369); + svfloat32_t v372 = svsub_f32_x(svptrue_b32(), v367, v369); + svfloat32_t v373 = svadd_f32_x(svptrue_b32(), v368, v370); + svfloat32_t v410 = svmul_f32_x(svptrue_b32(), v370, v774); + svfloat32_t v212 = svadd_f32_x(svptrue_b32(), v208, v210); + svfloat32_t v213 = svsub_f32_x(svptrue_b32(), v208, v210); + svfloat32_t v214 = svadd_f32_x(svptrue_b32(), v209, v211); + svfloat32_t zero237 = svdup_n_f32(0); + svfloat32_t v237 = svcmla_f32_x(pred_full, zero237, v766, v209, 90); + svfloat32_t v265 = svadd_f32_x(svptrue_b32(), v261, v263); + svfloat32_t v266 = svsub_f32_x(svptrue_b32(), v261, v263); + svfloat32_t v267 = svadd_f32_x(svptrue_b32(), v262, v264); + svfloat32_t zero290 = svdup_n_f32(0); + svfloat32_t v290 = svcmla_f32_x(pred_full, zero290, v766, v262, 90); + svfloat32_t v321 = svadd_f32_x(svptrue_b32(), v318, v35); + svfloat32_t zero350 = svdup_n_f32(0); + svfloat32_t v350 = svcmla_f32_x(pred_full, zero350, v767, v320, 90); + svfloat32_t v374 = svadd_f32_x(svptrue_b32(), v371, v53); + svfloat32_t zero395 = svdup_n_f32(0); + svfloat32_t v395 = svcmla_f32_x(pred_full, zero395, v771, v372, 90); + svfloat32_t v405 = svmul_f32_x(svptrue_b32(), v373, v773); + svfloat32_t v215 = svadd_f32_x(svptrue_b32(), v212, v54); + svfloat32_t zero244 = svdup_n_f32(0); + svfloat32_t v244 = svcmla_f32_x(pred_full, zero244, v767, v214, 90); + 
svfloat32_t v268 = svadd_f32_x(svptrue_b32(), v265, v55); + svfloat32_t zero297 = svdup_n_f32(0); + svfloat32_t v297 = svcmla_f32_x(pred_full, zero297, v767, v267, 90); + svfloat32_t v358 = svmla_f32_x(pred_full, v321, v318, v764); + svfloat32_t v361 = svsub_f32_x(svptrue_b32(), v343, v350); + svfloat32_t v362 = svcmla_f32_x(pred_full, v350, v768, v317, 90); + svfloat32_t zero381 = svdup_n_f32(0); + svfloat32_t v381 = svcmla_f32_x(pred_full, zero381, v769, v374, 90); + svfloat32_t v414 = svnmls_f32_x(pred_full, v405, v368, v772); + svfloat32_t v415 = svmla_f32_x(pred_full, v410, v373, v773); + svfloat32_t v252 = svmla_f32_x(pred_full, v215, v212, v764); + svfloat32_t v255 = svsub_f32_x(svptrue_b32(), v237, v244); + svfloat32_t v256 = svcmla_f32_x(pred_full, v244, v768, v211, 90); + svfloat32_t v305 = svmla_f32_x(pred_full, v268, v265, v764); + svfloat32_t v308 = svsub_f32_x(svptrue_b32(), v290, v297); + svfloat32_t v309 = svcmla_f32_x(pred_full, v297, v768, v264, 90); + svfloat32_t v359 = svmla_f32_x(pred_full, v358, v319, v765); + svfloat32_t v360 = svmls_f32_x(pred_full, v358, v319, v765); + svfloat32_t v411 = svcmla_f32_x(pred_full, v381, v770, v371, 90); + svfloat32_t v420 = svadd_f32_x(svptrue_b32(), v321, v381); + svfloat32_t v421 = svsub_f32_x(svptrue_b32(), v321, v381); + svst1_f64(pred_full, (double *)(v782), svreinterpret_f64_f32(v215)); + svst1_f64(pred_full, (double *)(v800), svreinterpret_f64_f32(v268)); + svfloat32_t v253 = svmla_f32_x(pred_full, v252, v213, v765); + svfloat32_t v254 = svmls_f32_x(pred_full, v252, v213, v765); + svfloat32_t v306 = svmla_f32_x(pred_full, v305, v266, v765); + svfloat32_t v307 = svmls_f32_x(pred_full, v305, v266, v765); + svfloat32_t v363 = svadd_f32_x(svptrue_b32(), v359, v361); + svfloat32_t v364 = svsub_f32_x(svptrue_b32(), v359, v361); + svfloat32_t v365 = svadd_f32_x(svptrue_b32(), v360, v362); + svfloat32_t v366 = svsub_f32_x(svptrue_b32(), v360, v362); + svfloat32_t v412 = svadd_f32_x(svptrue_b32(), v411, v395); + svfloat32_t v413 = svsub_f32_x(svptrue_b32(), v411, v395); + svst1_f64(pred_full, (double *)(v791), svreinterpret_f64_f32(v421)); + svst1_f64(pred_full, (double *)(v809), svreinterpret_f64_f32(v420)); + svfloat32_t v257 = svadd_f32_x(svptrue_b32(), v253, v255); + svfloat32_t v258 = svsub_f32_x(svptrue_b32(), v253, v255); + svfloat32_t v259 = svadd_f32_x(svptrue_b32(), v254, v256); + svfloat32_t v260 = svsub_f32_x(svptrue_b32(), v254, v256); + svfloat32_t v310 = svadd_f32_x(svptrue_b32(), v306, v308); + svfloat32_t v311 = svsub_f32_x(svptrue_b32(), v306, v308); + svfloat32_t v312 = svadd_f32_x(svptrue_b32(), v307, v309); + svfloat32_t v313 = svsub_f32_x(svptrue_b32(), v307, v309); + svfloat32_t v416 = svadd_f32_x(svptrue_b32(), v412, v414); + svfloat32_t v417 = svsub_f32_x(svptrue_b32(), v412, v414); + svfloat32_t v418 = svadd_f32_x(svptrue_b32(), v413, v415); + svfloat32_t v419 = svsub_f32_x(svptrue_b32(), v413, v415); + svfloat32_t v450 = svadd_f32_x(svptrue_b32(), v364, v417); + svfloat32_t v451 = svsub_f32_x(svptrue_b32(), v364, v417); + svfloat32_t v480 = svadd_f32_x(svptrue_b32(), v366, v419); + svfloat32_t v481 = svsub_f32_x(svptrue_b32(), v366, v419); + svfloat32_t v510 = svadd_f32_x(svptrue_b32(), v365, v418); + svfloat32_t v511 = svsub_f32_x(svptrue_b32(), v365, v418); + svfloat32_t v540 = svadd_f32_x(svptrue_b32(), v363, v416); + svfloat32_t v541 = svsub_f32_x(svptrue_b32(), v363, v416); + svst1_f64(pred_full, (double *)(v818), svreinterpret_f64_f32(v258)); + svst1_f64(pred_full, (double *)(v836), 
svreinterpret_f64_f32(v311)); + svst1_f64(pred_full, (double *)(v854), svreinterpret_f64_f32(v260)); + svst1_f64(pred_full, (double *)(v872), svreinterpret_f64_f32(v313)); + svst1_f64(pred_full, (double *)(v890), svreinterpret_f64_f32(v259)); + svst1_f64(pred_full, (double *)(v908), svreinterpret_f64_f32(v312)); + svst1_f64(pred_full, (double *)(v926), svreinterpret_f64_f32(v257)); + svst1_f64(pred_full, (double *)(v944), svreinterpret_f64_f32(v310)); + svst1_f64(pred_full, (double *)(v827), svreinterpret_f64_f32(v451)); + svst1_f64(pred_full, (double *)(v845), svreinterpret_f64_f32(v450)); + svst1_f64(pred_full, (double *)(v863), svreinterpret_f64_f32(v481)); + svst1_f64(pred_full, (double *)(v881), svreinterpret_f64_f32(v480)); + svst1_f64(pred_full, (double *)(v899), svreinterpret_f64_f32(v511)); + svst1_f64(pred_full, (double *)(v917), svreinterpret_f64_f32(v510)); + svst1_f64(pred_full, (double *)(v935), svreinterpret_f64_f32(v541)); + svst1_f64(pred_full, (double *)(v953), svreinterpret_f64_f32(v540)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cf32_ac_n_gu21(const armral_cmplx_int16_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + int idist, float dir) { + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + for (int j = 0; j < howmany; j += 1) { + int16x4_t v125 = vld1s_s16(&v5[istride]); + float v185 = -1.1666666666666665e+00F; + float v189 = 7.9015646852540022e-01F; + float v193 = 5.5854267289647742e-02F; + float v197 = 7.3430220123575241e-01F; + float v200 = 4.4095855184409838e-01F; + float v201 = -4.4095855184409838e-01F; + float v207 = 3.4087293062393137e-01F; + float v208 = -3.4087293062393137e-01F; + float v214 = -5.3396936033772524e-01F; + float v215 = 5.3396936033772524e-01F; + float v221 = 8.7484229096165667e-01F; + float v222 = -8.7484229096165667e-01F; + float v265 = -1.4999999999999998e+00F; + float v269 = 1.7499999999999996e+00F; + float v273 = -1.1852347027881001e+00F; + float v277 = -8.3781400934471603e-02F; + float v281 = -1.1014533018536286e+00F; + float v284 = -6.6143782776614746e-01F; + float v285 = 6.6143782776614746e-01F; + float v291 = -5.1130939593589697e-01F; + float v292 = 5.1130939593589697e-01F; + float v298 = 8.0095404050658769e-01F; + float v299 = -8.0095404050658769e-01F; + float v305 = -1.3122634364424848e+00F; + float v306 = 1.3122634364424848e+00F; + float v348 = 8.6602540378443871e-01F; + float v349 = -8.6602540378443871e-01F; + float v355 = -1.0103629710818451e+00F; + float v356 = 1.0103629710818451e+00F; + float v362 = 6.8429557470759583e-01F; + float v363 = -6.8429557470759583e-01F; + float v369 = 4.8371214382601155e-02F; + float v370 = -4.8371214382601155e-02F; + float v376 = 6.3592436032499466e-01F; + float v377 = -6.3592436032499466e-01F; + float32x2_t v379 = (float32x2_t){v4, v4}; + float v384 = -3.8188130791298663e-01F; + float v388 = -2.9520461738277515e-01F; + float v392 = 4.6243103089499693e-01F; + float v396 = -7.5763564827777208e-01F; + int16x4_t v34 = vld1s_s16(&v5[0]); + float32x2_t v126 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v125)), 15); + float32x2_t v186 = (float32x2_t){v185, v185}; + float32x2_t v190 = (float32x2_t){v189, v189}; + float32x2_t v194 = (float32x2_t){v193, v193}; + float32x2_t v198 = (float32x2_t){v197, v197}; + float32x2_t v202 = (float32x2_t){v200, v201}; + float32x2_t v209 = (float32x2_t){v207, v208}; + float32x2_t v216 = (float32x2_t){v214, v215}; + float32x2_t v223 = 
(float32x2_t){v221, v222}; + float32x2_t v266 = (float32x2_t){v265, v265}; + float32x2_t v270 = (float32x2_t){v269, v269}; + float32x2_t v274 = (float32x2_t){v273, v273}; + float32x2_t v278 = (float32x2_t){v277, v277}; + float32x2_t v282 = (float32x2_t){v281, v281}; + float32x2_t v286 = (float32x2_t){v284, v285}; + float32x2_t v293 = (float32x2_t){v291, v292}; + float32x2_t v300 = (float32x2_t){v298, v299}; + float32x2_t v307 = (float32x2_t){v305, v306}; + float32x2_t v350 = (float32x2_t){v348, v349}; + float32x2_t v357 = (float32x2_t){v355, v356}; + float32x2_t v364 = (float32x2_t){v362, v363}; + float32x2_t v371 = (float32x2_t){v369, v370}; + float32x2_t v378 = (float32x2_t){v376, v377}; + float32x2_t v385 = (float32x2_t){v384, v384}; + float32x2_t v389 = (float32x2_t){v388, v388}; + float32x2_t v393 = (float32x2_t){v392, v392}; + float32x2_t v397 = (float32x2_t){v396, v396}; + int16x4_t v20 = vld1s_s16(&v5[istride * 7]); + int16x4_t v26 = vld1s_s16(&v5[istride * 14]); + float32x2_t v35 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v34)), 15); + int16x4_t v41 = vld1s_s16(&v5[istride * 10]); + int16x4_t v47 = vld1s_s16(&v5[istride * 17]); + int16x4_t v55 = vld1s_s16(&v5[istride * 3]); + int16x4_t v62 = vld1s_s16(&v5[istride * 13]); + int16x4_t v68 = vld1s_s16(&v5[istride * 20]); + int16x4_t v76 = vld1s_s16(&v5[istride * 6]); + int16x4_t v83 = vld1s_s16(&v5[istride * 16]); + int16x4_t v89 = vld1s_s16(&v5[istride * 2]); + int16x4_t v97 = vld1s_s16(&v5[istride * 9]); + int16x4_t v104 = vld1s_s16(&v5[istride * 19]); + int16x4_t v110 = vld1s_s16(&v5[istride * 5]); + int16x4_t v118 = vld1s_s16(&v5[istride * 12]); + int16x4_t v131 = vld1s_s16(&v5[istride * 8]); + int16x4_t v139 = vld1s_s16(&v5[istride * 15]); + int16x4_t v146 = vld1s_s16(&v5[istride * 4]); + int16x4_t v152 = vld1s_s16(&v5[istride * 11]); + int16x4_t v160 = vld1s_s16(&v5[istride * 18]); + float32x2_t v204 = vmul_f32(v379, v202); + float32x2_t v211 = vmul_f32(v379, v209); + float32x2_t v218 = vmul_f32(v379, v216); + float32x2_t v225 = vmul_f32(v379, v223); + float32x2_t v288 = vmul_f32(v379, v286); + float32x2_t v295 = vmul_f32(v379, v293); + float32x2_t v302 = vmul_f32(v379, v300); + float32x2_t v309 = vmul_f32(v379, v307); + float32x2_t v352 = vmul_f32(v379, v350); + float32x2_t v359 = vmul_f32(v379, v357); + float32x2_t v366 = vmul_f32(v379, v364); + float32x2_t v373 = vmul_f32(v379, v371); + float32x2_t v380 = vmul_f32(v379, v378); + float32x2_t v21 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v20)), 15); + float32x2_t v27 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v26)), 15); + float32x2_t v42 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v41)), 15); + float32x2_t v48 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v47)), 15); + float32x2_t v56 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v55)), 15); + float32x2_t v63 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v62)), 15); + float32x2_t v69 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v68)), 15); + float32x2_t v77 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v76)), 15); + float32x2_t v84 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v83)), 15); + float32x2_t v90 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v89)), 15); + float32x2_t v98 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v97)), 15); + float32x2_t v105 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v104)), 15); + float32x2_t v111 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v110)), 15); + float32x2_t v119 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v118)), 15); + float32x2_t v132 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v131)), 15); + float32x2_t v140 = 
vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v139)), 15); + float32x2_t v147 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v146)), 15); + float32x2_t v153 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v152)), 15); + float32x2_t v161 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v160)), 15); + float32x2_t v28 = vadd_f32(v21, v27); + float32x2_t v29 = vsub_f32(v21, v27); + float32x2_t v49 = vadd_f32(v42, v48); + float32x2_t v50 = vsub_f32(v42, v48); + float32x2_t v70 = vadd_f32(v63, v69); + float32x2_t v71 = vsub_f32(v63, v69); + float32x2_t v91 = vadd_f32(v84, v90); + float32x2_t v92 = vsub_f32(v84, v90); + float32x2_t v112 = vadd_f32(v105, v111); + float32x2_t v113 = vsub_f32(v105, v111); + float32x2_t v133 = vadd_f32(v126, v132); + float32x2_t v134 = vsub_f32(v126, v132); + float32x2_t v154 = vadd_f32(v147, v153); + float32x2_t v155 = vsub_f32(v147, v153); + float32x2_t v36 = vadd_f32(v28, v35); + float32x2_t v57 = vadd_f32(v49, v56); + float32x2_t v78 = vadd_f32(v70, v77); + float32x2_t v99 = vadd_f32(v91, v98); + float32x2_t v120 = vadd_f32(v112, v119); + float32x2_t v141 = vadd_f32(v133, v140); + float32x2_t v162 = vadd_f32(v154, v161); + float32x2_t v247 = vadd_f32(v49, v154); + float32x2_t v248 = vsub_f32(v49, v154); + float32x2_t v249 = vadd_f32(v112, v91); + float32x2_t v250 = vsub_f32(v112, v91); + float32x2_t v251 = vadd_f32(v70, v133); + float32x2_t v252 = vsub_f32(v70, v133); + float32x2_t v331 = vadd_f32(v50, v155); + float32x2_t v332 = vsub_f32(v50, v155); + float32x2_t v333 = vadd_f32(v113, v92); + float32x2_t v334 = vsub_f32(v113, v92); + float32x2_t v335 = vadd_f32(v71, v134); + float32x2_t v336 = vsub_f32(v71, v134); + float32x2_t v163 = vadd_f32(v57, v162); + float32x2_t v164 = vsub_f32(v57, v162); + float32x2_t v165 = vadd_f32(v120, v99); + float32x2_t v166 = vsub_f32(v120, v99); + float32x2_t v167 = vadd_f32(v78, v141); + float32x2_t v168 = vsub_f32(v78, v141); + float32x2_t v253 = vadd_f32(v247, v249); + float32x2_t v256 = vsub_f32(v247, v249); + float32x2_t v257 = vsub_f32(v249, v251); + float32x2_t v258 = vsub_f32(v251, v247); + float32x2_t v259 = vadd_f32(v248, v250); + float32x2_t v261 = vsub_f32(v248, v250); + float32x2_t v262 = vsub_f32(v250, v252); + float32x2_t v263 = vsub_f32(v252, v248); + float32x2_t v337 = vadd_f32(v331, v333); + float32x2_t v340 = vsub_f32(v331, v333); + float32x2_t v341 = vsub_f32(v333, v335); + float32x2_t v342 = vsub_f32(v335, v331); + float32x2_t v343 = vadd_f32(v332, v334); + float32x2_t v345 = vsub_f32(v332, v334); + float32x2_t v346 = vsub_f32(v334, v336); + float32x2_t v347 = vsub_f32(v336, v332); + float32x2_t v169 = vadd_f32(v163, v165); + float32x2_t v172 = vsub_f32(v163, v165); + float32x2_t v173 = vsub_f32(v165, v167); + float32x2_t v174 = vsub_f32(v167, v163); + float32x2_t v175 = vadd_f32(v164, v166); + float32x2_t v177 = vsub_f32(v164, v166); + float32x2_t v178 = vsub_f32(v166, v168); + float32x2_t v179 = vsub_f32(v168, v164); + float32x2_t v254 = vadd_f32(v253, v251); + float32x2_t v260 = vadd_f32(v259, v252); + float32x2_t v275 = vmul_f32(v256, v274); + float32x2_t v279 = vmul_f32(v257, v278); + float32x2_t v283 = vmul_f32(v258, v282); + float32x2_t v296 = vrev64_f32(v261); + float32x2_t v303 = vrev64_f32(v262); + float32x2_t v310 = vrev64_f32(v263); + float32x2_t v338 = vadd_f32(v337, v335); + float32x2_t v344 = vadd_f32(v343, v336); + float32x2_t v367 = vrev64_f32(v340); + float32x2_t v374 = vrev64_f32(v341); + float32x2_t v381 = vrev64_f32(v342); + float32x2_t v390 = vmul_f32(v345, v389); + float32x2_t v394 = vmul_f32(v346, v393); + 
float32x2_t v398 = vmul_f32(v347, v397); + float32x2_t v170 = vadd_f32(v169, v167); + float32x2_t v176 = vadd_f32(v175, v168); + float32x2_t v191 = vmul_f32(v172, v190); + float32x2_t v195 = vmul_f32(v173, v194); + float32x2_t v199 = vmul_f32(v174, v198); + float32x2_t v212 = vrev64_f32(v177); + float32x2_t v219 = vrev64_f32(v178); + float32x2_t v226 = vrev64_f32(v179); + float32x2_t v255 = vadd_f32(v254, v28); + float32x2_t v271 = vmul_f32(v254, v270); + float32x2_t v289 = vrev64_f32(v260); + float32x2_t v297 = vmul_f32(v296, v295); + float32x2_t v304 = vmul_f32(v303, v302); + float32x2_t v311 = vmul_f32(v310, v309); + float32x2_t v339 = vadd_f32(v338, v29); + float32x2_t v360 = vrev64_f32(v338); + float32x2_t v368 = vmul_f32(v367, v366); + float32x2_t v375 = vmul_f32(v374, v373); + float32x2_t v382 = vmul_f32(v381, v380); + float32x2_t v386 = vmul_f32(v344, v385); + float32x2_t v171 = vadd_f32(v170, v36); + float32x2_t v187 = vmul_f32(v170, v186); + float32x2_t v205 = vrev64_f32(v176); + float32x2_t v213 = vmul_f32(v212, v211); + float32x2_t v220 = vmul_f32(v219, v218); + float32x2_t v227 = vmul_f32(v226, v225); + float32x2_t v267 = vmul_f32(v255, v266); + float32x2_t v290 = vmul_f32(v289, v288); + float32x2_t v353 = vrev64_f32(v339); + float32x2_t v361 = vmul_f32(v360, v359); + float32x2_t v406 = vadd_f32(v386, v390); + float32x2_t v408 = vsub_f32(v386, v390); + float32x2_t v410 = vsub_f32(v386, v394); + float32x2_t v206 = vmul_f32(v205, v204); + float32x2_t v228 = vadd_f32(v171, v187); + float32x2_t v312 = vadd_f32(v267, v271); + float32x2_t v319 = vadd_f32(v290, v297); + float32x2_t v321 = vsub_f32(v290, v297); + float32x2_t v323 = vsub_f32(v290, v304); + float32x2_t v354 = vmul_f32(v353, v352); + float32x2_t v407 = vadd_f32(v406, v394); + float32x2_t v409 = vsub_f32(v408, v398); + float32x2_t v411 = vadd_f32(v410, v398); + float32x2_t v418 = vadd_f32(v171, v267); + v6[0] = v171; + float32x2_t v229 = vadd_f32(v228, v191); + float32x2_t v231 = vsub_f32(v228, v191); + float32x2_t v233 = vsub_f32(v228, v195); + float32x2_t v235 = vadd_f32(v206, v213); + float32x2_t v237 = vsub_f32(v206, v213); + float32x2_t v239 = vsub_f32(v206, v220); + float32x2_t v313 = vadd_f32(v312, v275); + float32x2_t v315 = vsub_f32(v312, v275); + float32x2_t v317 = vsub_f32(v312, v279); + float32x2_t v320 = vadd_f32(v319, v304); + float32x2_t v322 = vsub_f32(v321, v311); + float32x2_t v324 = vadd_f32(v323, v311); + float32x2_t v399 = vadd_f32(v354, v361); + float32x2_t v419 = vadd_f32(v418, v354); + float32x2_t v420 = vsub_f32(v418, v354); + float32x2_t v230 = vadd_f32(v229, v195); + float32x2_t v232 = vsub_f32(v231, v199); + float32x2_t v234 = vadd_f32(v233, v199); + float32x2_t v236 = vadd_f32(v235, v220); + float32x2_t v238 = vsub_f32(v237, v227); + float32x2_t v240 = vadd_f32(v239, v227); + float32x2_t v314 = vadd_f32(v313, v279); + float32x2_t v316 = vsub_f32(v315, v283); + float32x2_t v318 = vadd_f32(v317, v283); + float32x2_t v400 = vadd_f32(v399, v368); + float32x2_t v402 = vsub_f32(v399, v368); + float32x2_t v404 = vsub_f32(v399, v375); + v6[ostride * 7] = v420; + v6[ostride * 14] = v419; + float32x2_t v241 = vadd_f32(v230, v236); + float32x2_t v242 = vsub_f32(v230, v236); + float32x2_t v243 = vadd_f32(v232, v238); + float32x2_t v244 = vsub_f32(v232, v238); + float32x2_t v245 = vadd_f32(v234, v240); + float32x2_t v246 = vsub_f32(v234, v240); + float32x2_t v325 = vadd_f32(v314, v320); + float32x2_t v326 = vsub_f32(v314, v320); + float32x2_t v327 = vadd_f32(v316, v322); + float32x2_t v328 = vsub_f32(v316, 
v322); + float32x2_t v329 = vadd_f32(v318, v324); + float32x2_t v330 = vsub_f32(v318, v324); + float32x2_t v401 = vadd_f32(v400, v375); + float32x2_t v403 = vsub_f32(v402, v382); + float32x2_t v405 = vadd_f32(v404, v382); + float32x2_t v412 = vadd_f32(v401, v407); + float32x2_t v413 = vsub_f32(v401, v407); + float32x2_t v414 = vadd_f32(v403, v409); + float32x2_t v415 = vsub_f32(v403, v409); + float32x2_t v416 = vadd_f32(v405, v411); + float32x2_t v417 = vsub_f32(v405, v411); + float32x2_t v436 = vadd_f32(v242, v326); + v6[ostride * 15] = v242; + float32x2_t v454 = vadd_f32(v244, v328); + v6[ostride * 9] = v244; + float32x2_t v472 = vadd_f32(v245, v329); + v6[ostride * 3] = v245; + float32x2_t v490 = vadd_f32(v246, v330); + v6[ostride * 18] = v246; + float32x2_t v508 = vadd_f32(v243, v327); + v6[ostride * 12] = v243; + float32x2_t v526 = vadd_f32(v241, v325); + v6[ostride * 6] = v241; + float32x2_t v437 = vadd_f32(v436, v413); + float32x2_t v438 = vsub_f32(v436, v413); + float32x2_t v455 = vadd_f32(v454, v415); + float32x2_t v456 = vsub_f32(v454, v415); + float32x2_t v473 = vadd_f32(v472, v416); + float32x2_t v474 = vsub_f32(v472, v416); + float32x2_t v491 = vadd_f32(v490, v417); + float32x2_t v492 = vsub_f32(v490, v417); + float32x2_t v509 = vadd_f32(v508, v414); + float32x2_t v510 = vsub_f32(v508, v414); + float32x2_t v527 = vadd_f32(v526, v412); + float32x2_t v528 = vsub_f32(v526, v412); + v6[ostride] = v438; + v6[ostride * 8] = v437; + v6[ostride * 16] = v456; + v6[ostride * 2] = v455; + v6[ostride * 10] = v474; + v6[ostride * 17] = v473; + v6[ostride * 4] = v492; + v6[ostride * 11] = v491; + v6[ostride * 19] = v510; + v6[ostride * 5] = v509; + v6[ostride * 13] = v528; + v6[ostride * 20] = v527; + v5 += 1 * idist; + v6 += 1 * 1; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cf32_ac_n_gu21(const armral_cmplx_int16_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + int idist, float dir) { + int64_t v0 = istride; + int64_t v1 = idist; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * v1; + int64_t v12 = v10 * 1; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + float v230 = -1.1666666666666665e+00F; + float v235 = 7.9015646852540022e-01F; + float v240 = 5.5854267289647742e-02F; + float v245 = 7.3430220123575241e-01F; + float v250 = -4.4095855184409838e-01F; + float v257 = -3.4087293062393137e-01F; + float v264 = 5.3396936033772524e-01F; + float v271 = -8.7484229096165667e-01F; + float v314 = -1.4999999999999998e+00F; + float v319 = 1.7499999999999996e+00F; + float v324 = -1.1852347027881001e+00F; + float v329 = -8.3781400934471603e-02F; + float v334 = -1.1014533018536286e+00F; + float v339 = 6.6143782776614746e-01F; + float v346 = 5.1130939593589697e-01F; + float v353 = -8.0095404050658769e-01F; + float v360 = 1.3122634364424848e+00F; + float v403 = -8.6602540378443871e-01F; + float v410 = 1.0103629710818451e+00F; + float v417 = -6.8429557470759583e-01F; + float v424 = -4.8371214382601155e-02F; + float v431 = -6.3592436032499466e-01F; + float v438 = -3.8188130791298663e-01F; + float v443 = -2.9520461738277515e-01F; + float v448 = 4.6243103089499693e-01F; + float v453 = -7.5763564827777208e-01F; + const int32_t *v786 = &v5[v0]; + float32x2_t *v904 = &v6[v2]; + int64_t v19 = v0 * 7; + int64_t v27 = v0 * 14; + int64_t v46 = v0 * 10; + int64_t 
v54 = v0 * 17; + int64_t v64 = v0 * 3; + int64_t v73 = v0 * 13; + int64_t v81 = v0 * 20; + int64_t v91 = v0 * 6; + int64_t v100 = v0 * 16; + int64_t v108 = v0 * 2; + int64_t v118 = v0 * 9; + int64_t v127 = v0 * 19; + int64_t v135 = v0 * 5; + int64_t v145 = v0 * 12; + int64_t v162 = v0 * 8; + int64_t v172 = v0 * 15; + int64_t v181 = v0 * 4; + int64_t v189 = v0 * 11; + int64_t v199 = v0 * 18; + float v253 = v4 * v250; + float v260 = v4 * v257; + float v267 = v4 * v264; + float v274 = v4 * v271; + float v342 = v4 * v339; + float v349 = v4 * v346; + float v356 = v4 * v353; + float v363 = v4 * v360; + float v406 = v4 * v403; + float v413 = v4 * v410; + float v420 = v4 * v417; + float v427 = v4 * v424; + float v434 = v4 * v431; + int64_t v487 = v2 * 7; + int64_t v494 = v2 * 14; + int64_t v504 = v2 * 15; + int64_t v518 = v2 * 8; + int64_t v528 = v2 * 9; + int64_t v535 = v2 * 16; + int64_t v542 = v2 * 2; + int64_t v552 = v2 * 3; + int64_t v559 = v2 * 10; + int64_t v566 = v2 * 17; + int64_t v576 = v2 * 18; + int64_t v583 = v2 * 4; + int64_t v590 = v2 * 11; + int64_t v600 = v2 * 12; + int64_t v607 = v2 * 19; + int64_t v614 = v2 * 5; + int64_t v624 = v2 * 6; + int64_t v631 = v2 * 13; + int64_t v638 = v2 * 20; + const int32_t *v669 = &v5[0]; + svint64_t v832 = svindex_s64(0, v1); + svfloat32_t v835 = svdup_n_f32(v230); + svfloat32_t v836 = svdup_n_f32(v235); + svfloat32_t v837 = svdup_n_f32(v240); + svfloat32_t v838 = svdup_n_f32(v245); + svfloat32_t v843 = svdup_n_f32(v314); + svfloat32_t v844 = svdup_n_f32(v319); + svfloat32_t v845 = svdup_n_f32(v324); + svfloat32_t v846 = svdup_n_f32(v329); + svfloat32_t v847 = svdup_n_f32(v334); + svfloat32_t v857 = svdup_n_f32(v438); + svfloat32_t v858 = svdup_n_f32(v443); + svfloat32_t v859 = svdup_n_f32(v448); + svfloat32_t v860 = svdup_n_f32(v453); + float32x2_t *v868 = &v6[0]; + const int32_t *v650 = &v5[v19]; + const int32_t *v659 = &v5[v27]; + svint16_t v671 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v669), v832)); + const int32_t *v678 = &v5[v46]; + const int32_t *v687 = &v5[v54]; + const int32_t *v696 = &v5[v64]; + const int32_t *v705 = &v5[v73]; + const int32_t *v714 = &v5[v81]; + const int32_t *v723 = &v5[v91]; + const int32_t *v732 = &v5[v100]; + const int32_t *v741 = &v5[v108]; + const int32_t *v750 = &v5[v118]; + const int32_t *v759 = &v5[v127]; + const int32_t *v768 = &v5[v135]; + const int32_t *v777 = &v5[v145]; + svint16_t v788 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v786), v832)); + const int32_t *v795 = &v5[v162]; + const int32_t *v804 = &v5[v172]; + const int32_t *v813 = &v5[v181]; + const int32_t *v822 = &v5[v189]; + const int32_t *v831 = &v5[v199]; + svfloat32_t v839 = svdup_n_f32(v253); + svfloat32_t v840 = svdup_n_f32(v260); + svfloat32_t v841 = svdup_n_f32(v267); + svfloat32_t v842 = svdup_n_f32(v274); + svfloat32_t v848 = svdup_n_f32(v342); + svfloat32_t v849 = svdup_n_f32(v349); + svfloat32_t v850 = svdup_n_f32(v356); + svfloat32_t v851 = svdup_n_f32(v363); + svfloat32_t v852 = svdup_n_f32(v406); + svfloat32_t v853 = svdup_n_f32(v413); + svfloat32_t v854 = svdup_n_f32(v420); + svfloat32_t v855 = svdup_n_f32(v427); + svfloat32_t v856 = svdup_n_f32(v434); + float32x2_t *v877 = &v6[v487]; + float32x2_t *v886 = &v6[v494]; + float32x2_t *v895 = &v6[v504]; + float32x2_t *v913 = &v6[v518]; + float32x2_t *v922 = &v6[v528]; + float32x2_t *v931 = &v6[v535]; + float32x2_t *v940 = &v6[v542]; + float32x2_t *v949 = &v6[v552]; + float32x2_t *v958 = &v6[v559]; + 
float32x2_t *v967 = &v6[v566]; + float32x2_t *v976 = &v6[v576]; + float32x2_t *v985 = &v6[v583]; + float32x2_t *v994 = &v6[v590]; + float32x2_t *v1003 = &v6[v600]; + float32x2_t *v1012 = &v6[v607]; + float32x2_t *v1021 = &v6[v614]; + float32x2_t *v1030 = &v6[v624]; + float32x2_t *v1039 = &v6[v631]; + float32x2_t *v1048 = &v6[v638]; + svfloat32_t v43 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v671, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v160 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v788, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svint16_t v652 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v650), v832)); + svint16_t v661 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v659), v832)); + svint16_t v680 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v678), v832)); + svint16_t v689 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v687), v832)); + svint16_t v698 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v696), v832)); + svint16_t v707 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v705), v832)); + svint16_t v716 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v714), v832)); + svint16_t v725 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v723), v832)); + svint16_t v734 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v732), v832)); + svint16_t v743 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v741), v832)); + svint16_t v752 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v750), v832)); + svint16_t v761 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v759), v832)); + svint16_t v770 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v768), v832)); + svint16_t v779 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v777), v832)); + svint16_t v797 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v795), v832)); + svint16_t v806 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v804), v832)); + svint16_t v815 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v813), v832)); + svint16_t v824 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v822), v832)); + svint16_t v833 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v831), v832)); + svfloat32_t v25 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v652, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v33 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v661, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v52 = svmul_n_f32_x( + pred_full, + 
svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v680, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v60 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v689, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v70 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v698, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v79 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v707, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v87 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v716, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v97 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v725, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v106 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v734, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v114 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v743, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v124 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v752, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v133 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v761, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v141 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v770, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v151 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v779, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v168 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v797, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v178 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v806, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v187 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v815, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v195 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, 
svreinterpret_s32_s16(svtbl_s16( + v824, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v205 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v833, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v34 = svadd_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v35 = svsub_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v61 = svadd_f32_x(svptrue_b32(), v52, v60); + svfloat32_t v62 = svsub_f32_x(svptrue_b32(), v52, v60); + svfloat32_t v88 = svadd_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v89 = svsub_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v115 = svadd_f32_x(svptrue_b32(), v106, v114); + svfloat32_t v116 = svsub_f32_x(svptrue_b32(), v106, v114); + svfloat32_t v142 = svadd_f32_x(svptrue_b32(), v133, v141); + svfloat32_t v143 = svsub_f32_x(svptrue_b32(), v133, v141); + svfloat32_t v169 = svadd_f32_x(svptrue_b32(), v160, v168); + svfloat32_t v170 = svsub_f32_x(svptrue_b32(), v160, v168); + svfloat32_t v196 = svadd_f32_x(svptrue_b32(), v187, v195); + svfloat32_t v197 = svsub_f32_x(svptrue_b32(), v187, v195); + svfloat32_t v44 = svadd_f32_x(svptrue_b32(), v34, v43); + svfloat32_t v71 = svadd_f32_x(svptrue_b32(), v61, v70); + svfloat32_t v98 = svadd_f32_x(svptrue_b32(), v88, v97); + svfloat32_t v125 = svadd_f32_x(svptrue_b32(), v115, v124); + svfloat32_t v152 = svadd_f32_x(svptrue_b32(), v142, v151); + svfloat32_t v179 = svadd_f32_x(svptrue_b32(), v169, v178); + svfloat32_t v206 = svadd_f32_x(svptrue_b32(), v196, v205); + svfloat32_t v296 = svadd_f32_x(svptrue_b32(), v61, v196); + svfloat32_t v297 = svsub_f32_x(svptrue_b32(), v61, v196); + svfloat32_t v298 = svadd_f32_x(svptrue_b32(), v142, v115); + svfloat32_t v299 = svsub_f32_x(svptrue_b32(), v142, v115); + svfloat32_t v300 = svadd_f32_x(svptrue_b32(), v88, v169); + svfloat32_t v301 = svsub_f32_x(svptrue_b32(), v88, v169); + svfloat32_t v385 = svadd_f32_x(svptrue_b32(), v62, v197); + svfloat32_t v386 = svsub_f32_x(svptrue_b32(), v62, v197); + svfloat32_t v387 = svadd_f32_x(svptrue_b32(), v143, v116); + svfloat32_t v388 = svsub_f32_x(svptrue_b32(), v143, v116); + svfloat32_t v389 = svadd_f32_x(svptrue_b32(), v89, v170); + svfloat32_t v390 = svsub_f32_x(svptrue_b32(), v89, v170); + svfloat32_t v207 = svadd_f32_x(svptrue_b32(), v71, v206); + svfloat32_t v208 = svsub_f32_x(svptrue_b32(), v71, v206); + svfloat32_t v209 = svadd_f32_x(svptrue_b32(), v152, v125); + svfloat32_t v210 = svsub_f32_x(svptrue_b32(), v152, v125); + svfloat32_t v211 = svadd_f32_x(svptrue_b32(), v98, v179); + svfloat32_t v212 = svsub_f32_x(svptrue_b32(), v98, v179); + svfloat32_t v302 = svadd_f32_x(svptrue_b32(), v296, v298); + svfloat32_t v305 = svsub_f32_x(svptrue_b32(), v296, v298); + svfloat32_t v306 = svsub_f32_x(svptrue_b32(), v298, v300); + svfloat32_t v307 = svsub_f32_x(svptrue_b32(), v300, v296); + svfloat32_t v308 = svadd_f32_x(svptrue_b32(), v297, v299); + svfloat32_t v310 = svsub_f32_x(svptrue_b32(), v297, v299); + svfloat32_t v311 = svsub_f32_x(svptrue_b32(), v299, v301); + svfloat32_t v312 = svsub_f32_x(svptrue_b32(), v301, v297); + svfloat32_t v391 = svadd_f32_x(svptrue_b32(), v385, v387); + svfloat32_t v394 = svsub_f32_x(svptrue_b32(), v385, v387); + svfloat32_t v395 = svsub_f32_x(svptrue_b32(), v387, v389); + svfloat32_t v396 = svsub_f32_x(svptrue_b32(), v389, v385); + svfloat32_t v397 = svadd_f32_x(svptrue_b32(), v386, v388); + svfloat32_t v399 = 
svsub_f32_x(svptrue_b32(), v386, v388); + svfloat32_t v400 = svsub_f32_x(svptrue_b32(), v388, v390); + svfloat32_t v401 = svsub_f32_x(svptrue_b32(), v390, v386); + svfloat32_t v213 = svadd_f32_x(svptrue_b32(), v207, v209); + svfloat32_t v216 = svsub_f32_x(svptrue_b32(), v207, v209); + svfloat32_t v217 = svsub_f32_x(svptrue_b32(), v209, v211); + svfloat32_t v218 = svsub_f32_x(svptrue_b32(), v211, v207); + svfloat32_t v219 = svadd_f32_x(svptrue_b32(), v208, v210); + svfloat32_t v221 = svsub_f32_x(svptrue_b32(), v208, v210); + svfloat32_t v222 = svsub_f32_x(svptrue_b32(), v210, v212); + svfloat32_t v223 = svsub_f32_x(svptrue_b32(), v212, v208); + svfloat32_t v303 = svadd_f32_x(svptrue_b32(), v302, v300); + svfloat32_t v309 = svadd_f32_x(svptrue_b32(), v308, v301); + svfloat32_t zero351 = svdup_n_f32(0); + svfloat32_t v351 = svcmla_f32_x(pred_full, zero351, v849, v310, 90); + svfloat32_t zero358 = svdup_n_f32(0); + svfloat32_t v358 = svcmla_f32_x(pred_full, zero358, v850, v311, 90); + svfloat32_t zero365 = svdup_n_f32(0); + svfloat32_t v365 = svcmla_f32_x(pred_full, zero365, v851, v312, 90); + svfloat32_t v392 = svadd_f32_x(svptrue_b32(), v391, v389); + svfloat32_t v398 = svadd_f32_x(svptrue_b32(), v397, v390); + svfloat32_t zero422 = svdup_n_f32(0); + svfloat32_t v422 = svcmla_f32_x(pred_full, zero422, v854, v394, 90); + svfloat32_t zero429 = svdup_n_f32(0); + svfloat32_t v429 = svcmla_f32_x(pred_full, zero429, v855, v395, 90); + svfloat32_t zero436 = svdup_n_f32(0); + svfloat32_t v436 = svcmla_f32_x(pred_full, zero436, v856, v396, 90); + svfloat32_t v446 = svmul_f32_x(svptrue_b32(), v399, v858); + svfloat32_t v451 = svmul_f32_x(svptrue_b32(), v400, v859); + svfloat32_t v214 = svadd_f32_x(svptrue_b32(), v213, v211); + svfloat32_t v220 = svadd_f32_x(svptrue_b32(), v219, v212); + svfloat32_t zero262 = svdup_n_f32(0); + svfloat32_t v262 = svcmla_f32_x(pred_full, zero262, v840, v221, 90); + svfloat32_t zero269 = svdup_n_f32(0); + svfloat32_t v269 = svcmla_f32_x(pred_full, zero269, v841, v222, 90); + svfloat32_t zero276 = svdup_n_f32(0); + svfloat32_t v276 = svcmla_f32_x(pred_full, zero276, v842, v223, 90); + svfloat32_t v304 = svadd_f32_x(svptrue_b32(), v303, v34); + svfloat32_t v322 = svmul_f32_x(svptrue_b32(), v303, v844); + svfloat32_t zero344 = svdup_n_f32(0); + svfloat32_t v344 = svcmla_f32_x(pred_full, zero344, v848, v309, 90); + svfloat32_t v393 = svadd_f32_x(svptrue_b32(), v392, v35); + svfloat32_t v215 = svadd_f32_x(svptrue_b32(), v214, v44); + svfloat32_t zero255 = svdup_n_f32(0); + svfloat32_t v255 = svcmla_f32_x(pred_full, zero255, v839, v220, 90); + svfloat32_t v373 = svadd_f32_x(svptrue_b32(), v344, v351); + svfloat32_t v375 = svsub_f32_x(svptrue_b32(), v344, v351); + svfloat32_t v377 = svsub_f32_x(svptrue_b32(), v344, v358); + svfloat32_t zero408 = svdup_n_f32(0); + svfloat32_t v408 = svcmla_f32_x(pred_full, zero408, v852, v393, 90); + svfloat32_t v464 = svmla_f32_x(pred_full, v446, v398, v857); + svfloat32_t v466 = svnmls_f32_x(pred_full, v446, v398, v857); + svfloat32_t v468 = svnmls_f32_x(pred_full, v451, v398, v857); + svfloat32_t v277 = svmla_f32_x(pred_full, v215, v214, v835); + svfloat32_t v284 = svadd_f32_x(svptrue_b32(), v255, v262); + svfloat32_t v286 = svsub_f32_x(svptrue_b32(), v255, v262); + svfloat32_t v288 = svsub_f32_x(svptrue_b32(), v255, v269); + svfloat32_t v366 = svmla_f32_x(pred_full, v322, v304, v843); + svfloat32_t v374 = svadd_f32_x(svptrue_b32(), v373, v358); + svfloat32_t v376 = svsub_f32_x(svptrue_b32(), v375, v365); + svfloat32_t v378 = 
svadd_f32_x(svptrue_b32(), v377, v365); + svfloat32_t v457 = svcmla_f32_x(pred_full, v408, v853, v392, 90); + svfloat32_t v465 = svmla_f32_x(pred_full, v464, v400, v859); + svfloat32_t v467 = svmls_f32_x(pred_full, v466, v401, v860); + svfloat32_t v469 = svmla_f32_x(pred_full, v468, v401, v860); + svfloat32_t v476 = svmla_f32_x(pred_full, v215, v304, v843); + svst1_f64(pred_full, (double *)(v868), svreinterpret_f64_f32(v215)); + svfloat32_t v278 = svmla_f32_x(pred_full, v277, v216, v836); + svfloat32_t v280 = svmls_f32_x(pred_full, v277, v216, v836); + svfloat32_t v282 = svmls_f32_x(pred_full, v277, v217, v837); + svfloat32_t v285 = svadd_f32_x(svptrue_b32(), v284, v269); + svfloat32_t v287 = svsub_f32_x(svptrue_b32(), v286, v276); + svfloat32_t v289 = svadd_f32_x(svptrue_b32(), v288, v276); + svfloat32_t v367 = svmla_f32_x(pred_full, v366, v305, v845); + svfloat32_t v369 = svmls_f32_x(pred_full, v366, v305, v845); + svfloat32_t v371 = svmls_f32_x(pred_full, v366, v306, v846); + svfloat32_t v458 = svadd_f32_x(svptrue_b32(), v457, v422); + svfloat32_t v460 = svsub_f32_x(svptrue_b32(), v457, v422); + svfloat32_t v462 = svsub_f32_x(svptrue_b32(), v457, v429); + svfloat32_t v477 = svadd_f32_x(svptrue_b32(), v476, v408); + svfloat32_t v478 = svsub_f32_x(svptrue_b32(), v476, v408); + svfloat32_t v279 = svmla_f32_x(pred_full, v278, v217, v837); + svfloat32_t v281 = svmls_f32_x(pred_full, v280, v218, v838); + svfloat32_t v283 = svmla_f32_x(pred_full, v282, v218, v838); + svfloat32_t v368 = svmla_f32_x(pred_full, v367, v306, v846); + svfloat32_t v370 = svmls_f32_x(pred_full, v369, v307, v847); + svfloat32_t v372 = svmla_f32_x(pred_full, v371, v307, v847); + svfloat32_t v459 = svadd_f32_x(svptrue_b32(), v458, v429); + svfloat32_t v461 = svsub_f32_x(svptrue_b32(), v460, v436); + svfloat32_t v463 = svadd_f32_x(svptrue_b32(), v462, v436); + svst1_f64(pred_full, (double *)(v877), svreinterpret_f64_f32(v478)); + svst1_f64(pred_full, (double *)(v886), svreinterpret_f64_f32(v477)); + svfloat32_t v290 = svadd_f32_x(svptrue_b32(), v279, v285); + svfloat32_t v291 = svsub_f32_x(svptrue_b32(), v279, v285); + svfloat32_t v292 = svadd_f32_x(svptrue_b32(), v281, v287); + svfloat32_t v293 = svsub_f32_x(svptrue_b32(), v281, v287); + svfloat32_t v294 = svadd_f32_x(svptrue_b32(), v283, v289); + svfloat32_t v295 = svsub_f32_x(svptrue_b32(), v283, v289); + svfloat32_t v379 = svadd_f32_x(svptrue_b32(), v368, v374); + svfloat32_t v380 = svsub_f32_x(svptrue_b32(), v368, v374); + svfloat32_t v381 = svadd_f32_x(svptrue_b32(), v370, v376); + svfloat32_t v382 = svsub_f32_x(svptrue_b32(), v370, v376); + svfloat32_t v383 = svadd_f32_x(svptrue_b32(), v372, v378); + svfloat32_t v384 = svsub_f32_x(svptrue_b32(), v372, v378); + svfloat32_t v470 = svadd_f32_x(svptrue_b32(), v459, v465); + svfloat32_t v471 = svsub_f32_x(svptrue_b32(), v459, v465); + svfloat32_t v472 = svadd_f32_x(svptrue_b32(), v461, v467); + svfloat32_t v473 = svsub_f32_x(svptrue_b32(), v461, v467); + svfloat32_t v474 = svadd_f32_x(svptrue_b32(), v463, v469); + svfloat32_t v475 = svsub_f32_x(svptrue_b32(), v463, v469); + svfloat32_t v500 = svadd_f32_x(svptrue_b32(), v291, v380); + svfloat32_t v524 = svadd_f32_x(svptrue_b32(), v293, v382); + svfloat32_t v548 = svadd_f32_x(svptrue_b32(), v294, v383); + svfloat32_t v572 = svadd_f32_x(svptrue_b32(), v295, v384); + svfloat32_t v596 = svadd_f32_x(svptrue_b32(), v292, v381); + svfloat32_t v620 = svadd_f32_x(svptrue_b32(), v290, v379); + svst1_f64(pred_full, (double *)(v895), svreinterpret_f64_f32(v291)); + 
svst1_f64(pred_full, (double *)(v922), svreinterpret_f64_f32(v293)); + svst1_f64(pred_full, (double *)(v949), svreinterpret_f64_f32(v294)); + svst1_f64(pred_full, (double *)(v976), svreinterpret_f64_f32(v295)); + svst1_f64(pred_full, (double *)(v1003), svreinterpret_f64_f32(v292)); + svst1_f64(pred_full, (double *)(v1030), svreinterpret_f64_f32(v290)); + svfloat32_t v501 = svadd_f32_x(svptrue_b32(), v500, v471); + svfloat32_t v502 = svsub_f32_x(svptrue_b32(), v500, v471); + svfloat32_t v525 = svadd_f32_x(svptrue_b32(), v524, v473); + svfloat32_t v526 = svsub_f32_x(svptrue_b32(), v524, v473); + svfloat32_t v549 = svadd_f32_x(svptrue_b32(), v548, v474); + svfloat32_t v550 = svsub_f32_x(svptrue_b32(), v548, v474); + svfloat32_t v573 = svadd_f32_x(svptrue_b32(), v572, v475); + svfloat32_t v574 = svsub_f32_x(svptrue_b32(), v572, v475); + svfloat32_t v597 = svadd_f32_x(svptrue_b32(), v596, v472); + svfloat32_t v598 = svsub_f32_x(svptrue_b32(), v596, v472); + svfloat32_t v621 = svadd_f32_x(svptrue_b32(), v620, v470); + svfloat32_t v622 = svsub_f32_x(svptrue_b32(), v620, v470); + svst1_f64(pred_full, (double *)(v904), svreinterpret_f64_f32(v502)); + svst1_f64(pred_full, (double *)(v913), svreinterpret_f64_f32(v501)); + svst1_f64(pred_full, (double *)(v931), svreinterpret_f64_f32(v526)); + svst1_f64(pred_full, (double *)(v940), svreinterpret_f64_f32(v525)); + svst1_f64(pred_full, (double *)(v958), svreinterpret_f64_f32(v550)); + svst1_f64(pred_full, (double *)(v967), svreinterpret_f64_f32(v549)); + svst1_f64(pred_full, (double *)(v985), svreinterpret_f64_f32(v574)); + svst1_f64(pred_full, (double *)(v994), svreinterpret_f64_f32(v573)); + svst1_f64(pred_full, (double *)(v1012), svreinterpret_f64_f32(v598)); + svst1_f64(pred_full, (double *)(v1021), svreinterpret_f64_f32(v597)); + svst1_f64(pred_full, (double *)(v1039), svreinterpret_f64_f32(v622)); + svst1_f64(pred_full, (double *)(v1048), svreinterpret_f64_f32(v621)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cf32_ac_n_gu22(const armral_cmplx_int16_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + int idist, float dir) { + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + for (int j = 0; j < howmany; j += 1) { + int16x4_t v110 = vld1s_s16(&v5[istride]); + float v410 = 1.1000000000000001e+00F; + float v413 = 3.3166247903554003e-01F; + float v414 = -3.3166247903554003e-01F; + float v421 = 5.1541501300188641e-01F; + float v425 = 9.4125353283118118e-01F; + float v429 = 1.4143537075597825e+00F; + float v433 = 8.5949297361449750e-01F; + float v437 = 4.2314838273285138e-02F; + float v441 = 3.8639279888589606e-01F; + float v445 = 5.1254589567200015e-01F; + float v449 = 1.0702757469471715e+00F; + float v453 = 5.5486073394528512e-01F; + float v456 = 1.2412944743900585e+00F; + float v457 = -1.2412944743900585e+00F; + float v463 = 2.0897833842005756e-01F; + float v464 = -2.0897833842005756e-01F; + float v470 = 3.7415717312460811e-01F; + float v471 = -3.7415717312460811e-01F; + float v477 = 4.9929922194110327e-02F; + float v478 = -4.9929922194110327e-02F; + float v484 = 6.5815896284539266e-01F; + float v485 = -6.5815896284539266e-01F; + float v491 = 6.3306543373877577e-01F; + float v492 = -6.3306543373877577e-01F; + float v498 = 1.0822460581641109e+00F; + float v499 = -1.0822460581641109e+00F; + float v505 = 8.1720737907134022e-01F; + float v506 = -8.1720737907134022e-01F; + float v512 = 4.2408709531871824e-01F; + float 
v513 = -4.2408709531871824e-01F; + float32x2_t v515 = (float32x2_t){v4, v4}; + int16x4_t v20 = vld1s_s16(&v5[0]); + float32x2_t v111 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v110)), 15); + float32x2_t v411 = (float32x2_t){v410, v410}; + float32x2_t v415 = (float32x2_t){v413, v414}; + float32x2_t v422 = (float32x2_t){v421, v421}; + float32x2_t v426 = (float32x2_t){v425, v425}; + float32x2_t v430 = (float32x2_t){v429, v429}; + float32x2_t v434 = (float32x2_t){v433, v433}; + float32x2_t v438 = (float32x2_t){v437, v437}; + float32x2_t v442 = (float32x2_t){v441, v441}; + float32x2_t v446 = (float32x2_t){v445, v445}; + float32x2_t v450 = (float32x2_t){v449, v449}; + float32x2_t v454 = (float32x2_t){v453, v453}; + float32x2_t v458 = (float32x2_t){v456, v457}; + float32x2_t v465 = (float32x2_t){v463, v464}; + float32x2_t v472 = (float32x2_t){v470, v471}; + float32x2_t v479 = (float32x2_t){v477, v478}; + float32x2_t v486 = (float32x2_t){v484, v485}; + float32x2_t v493 = (float32x2_t){v491, v492}; + float32x2_t v500 = (float32x2_t){v498, v499}; + float32x2_t v507 = (float32x2_t){v505, v506}; + float32x2_t v514 = (float32x2_t){v512, v513}; + float32x2_t v21 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v20)), 15); + int16x4_t v26 = vld1s_s16(&v5[istride * 11]); + int16x4_t v34 = vld1s_s16(&v5[istride * 2]); + int16x4_t v40 = vld1s_s16(&v5[istride * 13]); + int16x4_t v48 = vld1s_s16(&v5[istride * 4]); + int16x4_t v54 = vld1s_s16(&v5[istride * 15]); + int16x4_t v62 = vld1s_s16(&v5[istride * 6]); + int16x4_t v68 = vld1s_s16(&v5[istride * 17]); + int16x4_t v76 = vld1s_s16(&v5[istride * 8]); + int16x4_t v82 = vld1s_s16(&v5[istride * 19]); + int16x4_t v90 = vld1s_s16(&v5[istride * 10]); + int16x4_t v96 = vld1s_s16(&v5[istride * 21]); + int16x4_t v104 = vld1s_s16(&v5[istride * 12]); + int16x4_t v118 = vld1s_s16(&v5[istride * 14]); + int16x4_t v124 = vld1s_s16(&v5[istride * 3]); + int16x4_t v132 = vld1s_s16(&v5[istride * 16]); + int16x4_t v138 = vld1s_s16(&v5[istride * 5]); + int16x4_t v146 = vld1s_s16(&v5[istride * 18]); + int16x4_t v152 = vld1s_s16(&v5[istride * 7]); + int16x4_t v160 = vld1s_s16(&v5[istride * 20]); + int16x4_t v166 = vld1s_s16(&v5[istride * 9]); + float32x2_t v417 = vmul_f32(v515, v415); + float32x2_t v460 = vmul_f32(v515, v458); + float32x2_t v467 = vmul_f32(v515, v465); + float32x2_t v474 = vmul_f32(v515, v472); + float32x2_t v481 = vmul_f32(v515, v479); + float32x2_t v488 = vmul_f32(v515, v486); + float32x2_t v495 = vmul_f32(v515, v493); + float32x2_t v502 = vmul_f32(v515, v500); + float32x2_t v509 = vmul_f32(v515, v507); + float32x2_t v516 = vmul_f32(v515, v514); + float32x2_t v27 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v26)), 15); + float32x2_t v35 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v34)), 15); + float32x2_t v41 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v40)), 15); + float32x2_t v49 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v48)), 15); + float32x2_t v55 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v54)), 15); + float32x2_t v63 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v62)), 15); + float32x2_t v69 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v68)), 15); + float32x2_t v77 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v76)), 15); + float32x2_t v83 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v82)), 15); + float32x2_t v91 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v90)), 15); + float32x2_t v97 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v96)), 15); + float32x2_t v105 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v104)), 15); + float32x2_t v119 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v118)), 15); + float32x2_t v125 = 
vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v124)), 15); + float32x2_t v133 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v132)), 15); + float32x2_t v139 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v138)), 15); + float32x2_t v147 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v146)), 15); + float32x2_t v153 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v152)), 15); + float32x2_t v161 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v160)), 15); + float32x2_t v167 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v166)), 15); + float32x2_t v28 = vadd_f32(v21, v27); + float32x2_t v29 = vsub_f32(v21, v27); + float32x2_t v42 = vadd_f32(v35, v41); + float32x2_t v43 = vsub_f32(v35, v41); + float32x2_t v56 = vadd_f32(v49, v55); + float32x2_t v57 = vsub_f32(v49, v55); + float32x2_t v70 = vadd_f32(v63, v69); + float32x2_t v71 = vsub_f32(v63, v69); + float32x2_t v84 = vadd_f32(v77, v83); + float32x2_t v85 = vsub_f32(v77, v83); + float32x2_t v98 = vadd_f32(v91, v97); + float32x2_t v99 = vsub_f32(v91, v97); + float32x2_t v112 = vadd_f32(v105, v111); + float32x2_t v113 = vsub_f32(v105, v111); + float32x2_t v126 = vadd_f32(v119, v125); + float32x2_t v127 = vsub_f32(v119, v125); + float32x2_t v140 = vadd_f32(v133, v139); + float32x2_t v141 = vsub_f32(v133, v139); + float32x2_t v154 = vadd_f32(v147, v153); + float32x2_t v155 = vsub_f32(v147, v153); + float32x2_t v168 = vadd_f32(v161, v167); + float32x2_t v169 = vsub_f32(v161, v167); + float32x2_t v170 = vadd_f32(v42, v168); + float32x2_t v171 = vadd_f32(v56, v154); + float32x2_t v172 = vadd_f32(v70, v140); + float32x2_t v173 = vadd_f32(v84, v126); + float32x2_t v174 = vadd_f32(v98, v112); + float32x2_t v175 = vsub_f32(v42, v168); + float32x2_t v176 = vsub_f32(v56, v154); + float32x2_t v177 = vsub_f32(v70, v140); + float32x2_t v178 = vsub_f32(v84, v126); + float32x2_t v179 = vsub_f32(v98, v112); + float32x2_t v368 = vadd_f32(v43, v169); + float32x2_t v369 = vadd_f32(v57, v155); + float32x2_t v370 = vadd_f32(v71, v141); + float32x2_t v371 = vadd_f32(v85, v127); + float32x2_t v372 = vadd_f32(v99, v113); + float32x2_t v373 = vsub_f32(v43, v169); + float32x2_t v374 = vsub_f32(v57, v155); + float32x2_t v375 = vsub_f32(v71, v141); + float32x2_t v376 = vsub_f32(v85, v127); + float32x2_t v377 = vsub_f32(v99, v113); + float32x2_t v180 = vadd_f32(v170, v171); + float32x2_t v181 = vadd_f32(v172, v174); + float32x2_t v183 = vsub_f32(v176, v177); + float32x2_t v184 = vadd_f32(v175, v179); + float32x2_t v189 = vsub_f32(v171, v173); + float32x2_t v190 = vsub_f32(v170, v173); + float32x2_t v191 = vsub_f32(v171, v170); + float32x2_t v192 = vsub_f32(v174, v173); + float32x2_t v193 = vsub_f32(v172, v173); + float32x2_t v194 = vsub_f32(v174, v172); + float32x2_t v195 = vsub_f32(v171, v174); + float32x2_t v196 = vsub_f32(v170, v172); + float32x2_t v198 = vadd_f32(v176, v178); + float32x2_t v199 = vsub_f32(v175, v178); + float32x2_t v200 = vadd_f32(v175, v176); + float32x2_t v201 = vsub_f32(v178, v179); + float32x2_t v202 = vsub_f32(v177, v178); + float32x2_t v203 = vsub_f32(v177, v179); + float32x2_t v204 = vadd_f32(v176, v179); + float32x2_t v205 = vsub_f32(v175, v177); + float32x2_t v378 = vadd_f32(v368, v369); + float32x2_t v379 = vadd_f32(v370, v372); + float32x2_t v381 = vsub_f32(v374, v375); + float32x2_t v382 = vadd_f32(v373, v377); + float32x2_t v387 = vsub_f32(v369, v371); + float32x2_t v388 = vsub_f32(v368, v371); + float32x2_t v389 = vsub_f32(v369, v368); + float32x2_t v390 = vsub_f32(v372, v371); + float32x2_t v391 = vsub_f32(v370, v371); + float32x2_t v392 = vsub_f32(v372, v370); + float32x2_t v393 = 
vsub_f32(v369, v372); + float32x2_t v394 = vsub_f32(v368, v370); + float32x2_t v396 = vadd_f32(v374, v376); + float32x2_t v397 = vsub_f32(v373, v376); + float32x2_t v398 = vadd_f32(v373, v374); + float32x2_t v399 = vsub_f32(v376, v377); + float32x2_t v400 = vsub_f32(v375, v376); + float32x2_t v401 = vsub_f32(v375, v377); + float32x2_t v402 = vadd_f32(v374, v377); + float32x2_t v403 = vsub_f32(v373, v375); + float32x2_t v182 = vadd_f32(v173, v180); + float32x2_t v187 = vsub_f32(v183, v184); + float32x2_t v197 = vsub_f32(v181, v180); + float32x2_t v206 = vadd_f32(v183, v184); + float32x2_t v225 = vmul_f32(v189, v422); + float32x2_t v229 = vmul_f32(v190, v426); + float32x2_t v233 = vmul_f32(v191, v430); + float32x2_t v237 = vmul_f32(v192, v434); + float32x2_t v241 = vmul_f32(v193, v438); + float32x2_t v245 = vmul_f32(v194, v442); + float32x2_t v249 = vmul_f32(v195, v446); + float32x2_t v253 = vmul_f32(v196, v450); + float32x2_t v263 = vrev64_f32(v198); + float32x2_t v270 = vrev64_f32(v199); + float32x2_t v277 = vrev64_f32(v200); + float32x2_t v284 = vrev64_f32(v201); + float32x2_t v291 = vrev64_f32(v202); + float32x2_t v298 = vrev64_f32(v203); + float32x2_t v305 = vrev64_f32(v204); + float32x2_t v312 = vrev64_f32(v205); + float32x2_t v380 = vadd_f32(v371, v378); + float32x2_t v385 = vsub_f32(v381, v382); + float32x2_t v395 = vsub_f32(v379, v378); + float32x2_t v404 = vadd_f32(v381, v382); + float32x2_t v423 = vmul_f32(v387, v422); + float32x2_t v427 = vmul_f32(v388, v426); + float32x2_t v431 = vmul_f32(v389, v430); + float32x2_t v435 = vmul_f32(v390, v434); + float32x2_t v439 = vmul_f32(v391, v438); + float32x2_t v443 = vmul_f32(v392, v442); + float32x2_t v447 = vmul_f32(v393, v446); + float32x2_t v451 = vmul_f32(v394, v450); + float32x2_t v461 = vrev64_f32(v396); + float32x2_t v468 = vrev64_f32(v397); + float32x2_t v475 = vrev64_f32(v398); + float32x2_t v482 = vrev64_f32(v399); + float32x2_t v489 = vrev64_f32(v400); + float32x2_t v496 = vrev64_f32(v401); + float32x2_t v503 = vrev64_f32(v402); + float32x2_t v510 = vrev64_f32(v403); + float32x2_t v185 = vadd_f32(v182, v181); + float32x2_t v188 = vsub_f32(v187, v178); + float32x2_t v257 = vmul_f32(v197, v454); + float32x2_t v264 = vmul_f32(v263, v460); + float32x2_t v271 = vmul_f32(v270, v467); + float32x2_t v278 = vmul_f32(v277, v474); + float32x2_t v285 = vmul_f32(v284, v481); + float32x2_t v292 = vmul_f32(v291, v488); + float32x2_t v299 = vmul_f32(v298, v495); + float32x2_t v306 = vmul_f32(v305, v502); + float32x2_t v313 = vmul_f32(v312, v509); + float32x2_t v319 = vrev64_f32(v206); + float32x2_t v322 = vadd_f32(v225, v229); + float32x2_t v323 = vadd_f32(v229, v233); + float32x2_t v324 = vsub_f32(v225, v233); + float32x2_t v325 = vadd_f32(v237, v241); + float32x2_t v326 = vadd_f32(v241, v245); + float32x2_t v327 = vsub_f32(v237, v245); + float32x2_t v383 = vadd_f32(v380, v379); + float32x2_t v386 = vsub_f32(v385, v376); + float32x2_t v455 = vmul_f32(v395, v454); + float32x2_t v462 = vmul_f32(v461, v460); + float32x2_t v469 = vmul_f32(v468, v467); + float32x2_t v476 = vmul_f32(v475, v474); + float32x2_t v483 = vmul_f32(v482, v481); + float32x2_t v490 = vmul_f32(v489, v488); + float32x2_t v497 = vmul_f32(v496, v495); + float32x2_t v504 = vmul_f32(v503, v502); + float32x2_t v511 = vmul_f32(v510, v509); + float32x2_t v517 = vrev64_f32(v404); + float32x2_t v520 = vadd_f32(v423, v427); + float32x2_t v521 = vadd_f32(v427, v431); + float32x2_t v522 = vsub_f32(v423, v431); + float32x2_t v523 = vadd_f32(v435, v439); + float32x2_t v524 = vadd_f32(v439, 
v443); + float32x2_t v525 = vsub_f32(v435, v443); + float32x2_t v186 = vadd_f32(v28, v185); + float32x2_t v214 = vmul_f32(v185, v411); + float32x2_t v220 = vrev64_f32(v188); + float32x2_t v320 = vmul_f32(v319, v516); + float32x2_t v328 = vadd_f32(v253, v257); + float32x2_t v329 = vadd_f32(v249, v257); + float32x2_t v330 = vadd_f32(v271, v278); + float32x2_t v331 = vsub_f32(v264, v278); + float32x2_t v332 = vadd_f32(v292, v299); + float32x2_t v333 = vsub_f32(v285, v299); + float32x2_t v384 = vadd_f32(v29, v383); + float32x2_t v412 = vmul_f32(v383, v411); + float32x2_t v418 = vrev64_f32(v386); + float32x2_t v518 = vmul_f32(v517, v516); + float32x2_t v526 = vadd_f32(v451, v455); + float32x2_t v527 = vadd_f32(v447, v455); + float32x2_t v528 = vadd_f32(v469, v476); + float32x2_t v529 = vsub_f32(v462, v476); + float32x2_t v530 = vadd_f32(v490, v497); + float32x2_t v531 = vsub_f32(v483, v497); + float32x2_t v221 = vmul_f32(v220, v417); + float32x2_t v321 = vsub_f32(v186, v214); + float32x2_t v334 = vadd_f32(v313, v320); + float32x2_t v335 = vsub_f32(v306, v320); + float32x2_t v336 = vadd_f32(v326, v328); + float32x2_t v354 = vadd_f32(v330, v331); + float32x2_t v419 = vmul_f32(v418, v417); + float32x2_t v519 = vsub_f32(v384, v412); + float32x2_t v532 = vadd_f32(v511, v518); + float32x2_t v533 = vsub_f32(v504, v518); + float32x2_t v534 = vadd_f32(v524, v526); + float32x2_t v552 = vadd_f32(v528, v529); + v6[0] = v186; + v6[ostride * 11] = v384; + float32x2_t v337 = vadd_f32(v336, v321); + float32x2_t v338 = vsub_f32(v321, v323); + float32x2_t v340 = vadd_f32(v321, v327); + float32x2_t v342 = vsub_f32(v321, v324); + float32x2_t v344 = vadd_f32(v321, v322); + float32x2_t v346 = vadd_f32(v221, v332); + float32x2_t v348 = vsub_f32(v334, v330); + float32x2_t v350 = vadd_f32(v221, v335); + float32x2_t v352 = vsub_f32(v335, v331); + float32x2_t v355 = vadd_f32(v354, v332); + float32x2_t v535 = vadd_f32(v534, v519); + float32x2_t v536 = vsub_f32(v519, v521); + float32x2_t v538 = vadd_f32(v519, v525); + float32x2_t v540 = vsub_f32(v519, v522); + float32x2_t v542 = vadd_f32(v519, v520); + float32x2_t v544 = vadd_f32(v419, v530); + float32x2_t v546 = vsub_f32(v532, v528); + float32x2_t v548 = vadd_f32(v419, v533); + float32x2_t v550 = vsub_f32(v533, v529); + float32x2_t v553 = vadd_f32(v552, v530); + float32x2_t v339 = vsub_f32(v338, v328); + float32x2_t v341 = vadd_f32(v340, v329); + float32x2_t v343 = vsub_f32(v342, v329); + float32x2_t v345 = vsub_f32(v344, v325); + float32x2_t v347 = vadd_f32(v346, v334); + float32x2_t v349 = vsub_f32(v348, v221); + float32x2_t v351 = vadd_f32(v350, v333); + float32x2_t v353 = vsub_f32(v352, v221); + float32x2_t v356 = vadd_f32(v355, v333); + float32x2_t v537 = vsub_f32(v536, v526); + float32x2_t v539 = vadd_f32(v538, v527); + float32x2_t v541 = vsub_f32(v540, v527); + float32x2_t v543 = vsub_f32(v542, v523); + float32x2_t v545 = vadd_f32(v544, v532); + float32x2_t v547 = vsub_f32(v546, v419); + float32x2_t v549 = vadd_f32(v548, v531); + float32x2_t v551 = vsub_f32(v550, v419); + float32x2_t v554 = vadd_f32(v553, v531); + float32x2_t v357 = vsub_f32(v356, v221); + float32x2_t v359 = vadd_f32(v337, v347); + float32x2_t v360 = vadd_f32(v339, v349); + float32x2_t v361 = vsub_f32(v341, v351); + float32x2_t v362 = vadd_f32(v343, v353); + float32x2_t v363 = vsub_f32(v343, v353); + float32x2_t v364 = vadd_f32(v341, v351); + float32x2_t v365 = vsub_f32(v339, v349); + float32x2_t v366 = vsub_f32(v337, v347); + float32x2_t v555 = vsub_f32(v554, v419); + float32x2_t v557 = 
vadd_f32(v535, v545); + float32x2_t v558 = vadd_f32(v537, v547); + float32x2_t v559 = vsub_f32(v539, v549); + float32x2_t v560 = vadd_f32(v541, v551); + float32x2_t v561 = vsub_f32(v541, v551); + float32x2_t v562 = vadd_f32(v539, v549); + float32x2_t v563 = vsub_f32(v537, v547); + float32x2_t v564 = vsub_f32(v535, v545); + float32x2_t v358 = vadd_f32(v345, v357); + float32x2_t v367 = vsub_f32(v345, v357); + float32x2_t v556 = vadd_f32(v543, v555); + float32x2_t v565 = vsub_f32(v543, v555); + v6[ostride * 2] = v366; + v6[ostride * 13] = v564; + v6[ostride * 14] = v365; + v6[ostride * 3] = v563; + v6[ostride * 4] = v364; + v6[ostride * 15] = v562; + v6[ostride * 16] = v363; + v6[ostride * 5] = v561; + v6[ostride * 6] = v362; + v6[ostride * 17] = v560; + v6[ostride * 18] = v361; + v6[ostride * 7] = v559; + v6[ostride * 8] = v360; + v6[ostride * 19] = v558; + v6[ostride * 20] = v359; + v6[ostride * 9] = v557; + v6[ostride * 12] = v367; + v6[ostride] = v565; + v6[ostride * 10] = v358; + v6[ostride * 21] = v556; + v5 += 1 * idist; + v6 += 1 * 1; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cf32_ac_n_gu22(const armral_cmplx_int16_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + int idist, float dir) { + int64_t v0 = istride; + int64_t v1 = idist; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * v1; + int64_t v12 = v10 * 1; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + float v468 = 1.1000000000000001e+00F; + float v473 = -3.3166247903554003e-01F; + float v480 = 5.1541501300188641e-01F; + float v485 = 9.4125353283118118e-01F; + float v490 = 1.4143537075597825e+00F; + float v495 = 8.5949297361449750e-01F; + float v500 = 4.2314838273285138e-02F; + float v505 = 3.8639279888589606e-01F; + float v510 = 5.1254589567200015e-01F; + float v515 = 1.0702757469471715e+00F; + float v520 = 5.5486073394528512e-01F; + float v525 = -1.2412944743900585e+00F; + float v532 = -2.0897833842005756e-01F; + float v539 = -3.7415717312460811e-01F; + float v546 = -4.9929922194110327e-02F; + float v553 = -6.5815896284539266e-01F; + float v560 = -6.3306543373877577e-01F; + float v567 = -1.0822460581641109e+00F; + float v574 = -8.1720737907134022e-01F; + float v581 = -4.2408709531871824e-01F; + const int32_t *v912 = &v5[v0]; + float32x2_t *v1063 = &v6[v2]; + int64_t v27 = v0 * 11; + int64_t v37 = v0 * 2; + int64_t v45 = v0 * 13; + int64_t v55 = v0 * 4; + int64_t v63 = v0 * 15; + int64_t v73 = v0 * 6; + int64_t v81 = v0 * 17; + int64_t v91 = v0 * 8; + int64_t v99 = v0 * 19; + int64_t v109 = v0 * 10; + int64_t v117 = v0 * 21; + int64_t v127 = v0 * 12; + int64_t v145 = v0 * 14; + int64_t v153 = v0 * 3; + int64_t v163 = v0 * 16; + int64_t v171 = v0 * 5; + int64_t v181 = v0 * 18; + int64_t v189 = v0 * 7; + int64_t v199 = v0 * 20; + int64_t v207 = v0 * 9; + float v476 = v4 * v473; + float v528 = v4 * v525; + float v535 = v4 * v532; + float v542 = v4 * v539; + float v549 = v4 * v546; + float v556 = v4 * v553; + float v563 = v4 * v560; + float v570 = v4 * v567; + float v577 = v4 * v574; + float v584 = v4 * v581; + int64_t v642 = v2 * 11; + int64_t v649 = v2 * 12; + int64_t v663 = v2 * 2; + int64_t v670 = v2 * 13; + int64_t v677 = v2 * 14; + int64_t v684 = v2 * 3; + int64_t v691 = v2 * 4; + int64_t v698 = v2 * 15; + int64_t v705 = v2 * 16; + int64_t v712 = v2 * 5; + int64_t v719 
= v2 * 6; + int64_t v726 = v2 * 17; + int64_t v733 = v2 * 18; + int64_t v740 = v2 * 7; + int64_t v747 = v2 * 8; + int64_t v754 = v2 * 19; + int64_t v761 = v2 * 20; + int64_t v768 = v2 * 9; + int64_t v775 = v2 * 10; + int64_t v782 = v2 * 21; + const int32_t *v795 = &v5[0]; + svint64_t v985 = svindex_s64(0, v1); + svfloat32_t v1009 = svdup_n_f32(v468); + svfloat32_t v1011 = svdup_n_f32(v480); + svfloat32_t v1012 = svdup_n_f32(v485); + svfloat32_t v1013 = svdup_n_f32(v490); + svfloat32_t v1014 = svdup_n_f32(v495); + svfloat32_t v1015 = svdup_n_f32(v500); + svfloat32_t v1016 = svdup_n_f32(v505); + svfloat32_t v1017 = svdup_n_f32(v510); + svfloat32_t v1018 = svdup_n_f32(v515); + svfloat32_t v1019 = svdup_n_f32(v520); + float32x2_t *v1036 = &v6[0]; + svint16_t v797 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v795), v985)); + const int32_t *v804 = &v5[v27]; + const int32_t *v813 = &v5[v37]; + const int32_t *v822 = &v5[v45]; + const int32_t *v831 = &v5[v55]; + const int32_t *v840 = &v5[v63]; + const int32_t *v849 = &v5[v73]; + const int32_t *v858 = &v5[v81]; + const int32_t *v867 = &v5[v91]; + const int32_t *v876 = &v5[v99]; + const int32_t *v885 = &v5[v109]; + const int32_t *v894 = &v5[v117]; + const int32_t *v903 = &v5[v127]; + svint16_t v914 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v912), v985)); + const int32_t *v921 = &v5[v145]; + const int32_t *v930 = &v5[v153]; + const int32_t *v939 = &v5[v163]; + const int32_t *v948 = &v5[v171]; + const int32_t *v957 = &v5[v181]; + const int32_t *v966 = &v5[v189]; + const int32_t *v975 = &v5[v199]; + const int32_t *v984 = &v5[v207]; + svfloat32_t v1010 = svdup_n_f32(v476); + svfloat32_t v1020 = svdup_n_f32(v528); + svfloat32_t v1021 = svdup_n_f32(v535); + svfloat32_t v1022 = svdup_n_f32(v542); + svfloat32_t v1023 = svdup_n_f32(v549); + svfloat32_t v1024 = svdup_n_f32(v556); + svfloat32_t v1025 = svdup_n_f32(v563); + svfloat32_t v1026 = svdup_n_f32(v570); + svfloat32_t v1027 = svdup_n_f32(v577); + svfloat32_t v1028 = svdup_n_f32(v584); + float32x2_t *v1045 = &v6[v642]; + float32x2_t *v1054 = &v6[v649]; + float32x2_t *v1072 = &v6[v663]; + float32x2_t *v1081 = &v6[v670]; + float32x2_t *v1090 = &v6[v677]; + float32x2_t *v1099 = &v6[v684]; + float32x2_t *v1108 = &v6[v691]; + float32x2_t *v1117 = &v6[v698]; + float32x2_t *v1126 = &v6[v705]; + float32x2_t *v1135 = &v6[v712]; + float32x2_t *v1144 = &v6[v719]; + float32x2_t *v1153 = &v6[v726]; + float32x2_t *v1162 = &v6[v733]; + float32x2_t *v1171 = &v6[v740]; + float32x2_t *v1180 = &v6[v747]; + float32x2_t *v1189 = &v6[v754]; + float32x2_t *v1198 = &v6[v761]; + float32x2_t *v1207 = &v6[v768]; + float32x2_t *v1216 = &v6[v775]; + float32x2_t *v1225 = &v6[v782]; + svfloat32_t v25 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v797, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v141 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v914, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svint16_t v806 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v804), v985)); + svint16_t v815 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v813), v985)); + svint16_t v824 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, 
(const unsigned *)(v822), v985)); + svint16_t v833 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v831), v985)); + svint16_t v842 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v840), v985)); + svint16_t v851 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v849), v985)); + svint16_t v860 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v858), v985)); + svint16_t v869 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v867), v985)); + svint16_t v878 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v876), v985)); + svint16_t v887 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v885), v985)); + svint16_t v896 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v894), v985)); + svint16_t v905 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v903), v985)); + svint16_t v923 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v921), v985)); + svint16_t v932 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v930), v985)); + svint16_t v941 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v939), v985)); + svint16_t v950 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v948), v985)); + svint16_t v959 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v957), v985)); + svint16_t v968 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v966), v985)); + svint16_t v977 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v975), v985)); + svint16_t v986 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v984), v985)); + svfloat32_t v33 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v806, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v43 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v815, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v51 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v824, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v61 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v833, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v69 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v842, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v79 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v851, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v87 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v860, 
svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v97 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v869, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v105 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v878, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v115 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v887, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v123 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v896, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v133 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v905, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v151 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v923, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v159 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v932, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v169 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v941, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v177 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v950, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v187 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v959, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v195 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v968, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v205 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v977, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v213 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v986, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v34 = svadd_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v35 = svsub_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v52 = svadd_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v53 = svsub_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v70 = svadd_f32_x(svptrue_b32(), v61, v69); + svfloat32_t v71 = svsub_f32_x(svptrue_b32(), v61, v69); + svfloat32_t v88 = 
svadd_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v89 = svsub_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v106 = svadd_f32_x(svptrue_b32(), v97, v105); + svfloat32_t v107 = svsub_f32_x(svptrue_b32(), v97, v105); + svfloat32_t v124 = svadd_f32_x(svptrue_b32(), v115, v123); + svfloat32_t v125 = svsub_f32_x(svptrue_b32(), v115, v123); + svfloat32_t v142 = svadd_f32_x(svptrue_b32(), v133, v141); + svfloat32_t v143 = svsub_f32_x(svptrue_b32(), v133, v141); + svfloat32_t v160 = svadd_f32_x(svptrue_b32(), v151, v159); + svfloat32_t v161 = svsub_f32_x(svptrue_b32(), v151, v159); + svfloat32_t v178 = svadd_f32_x(svptrue_b32(), v169, v177); + svfloat32_t v179 = svsub_f32_x(svptrue_b32(), v169, v177); + svfloat32_t v196 = svadd_f32_x(svptrue_b32(), v187, v195); + svfloat32_t v197 = svsub_f32_x(svptrue_b32(), v187, v195); + svfloat32_t v214 = svadd_f32_x(svptrue_b32(), v205, v213); + svfloat32_t v215 = svsub_f32_x(svptrue_b32(), v205, v213); + svfloat32_t v216 = svadd_f32_x(svptrue_b32(), v52, v214); + svfloat32_t v217 = svadd_f32_x(svptrue_b32(), v70, v196); + svfloat32_t v218 = svadd_f32_x(svptrue_b32(), v88, v178); + svfloat32_t v219 = svadd_f32_x(svptrue_b32(), v106, v160); + svfloat32_t v220 = svadd_f32_x(svptrue_b32(), v124, v142); + svfloat32_t v221 = svsub_f32_x(svptrue_b32(), v52, v214); + svfloat32_t v222 = svsub_f32_x(svptrue_b32(), v70, v196); + svfloat32_t v223 = svsub_f32_x(svptrue_b32(), v88, v178); + svfloat32_t v224 = svsub_f32_x(svptrue_b32(), v106, v160); + svfloat32_t v225 = svsub_f32_x(svptrue_b32(), v124, v142); + svfloat32_t v425 = svadd_f32_x(svptrue_b32(), v53, v215); + svfloat32_t v426 = svadd_f32_x(svptrue_b32(), v71, v197); + svfloat32_t v427 = svadd_f32_x(svptrue_b32(), v89, v179); + svfloat32_t v428 = svadd_f32_x(svptrue_b32(), v107, v161); + svfloat32_t v429 = svadd_f32_x(svptrue_b32(), v125, v143); + svfloat32_t v430 = svsub_f32_x(svptrue_b32(), v53, v215); + svfloat32_t v431 = svsub_f32_x(svptrue_b32(), v71, v197); + svfloat32_t v432 = svsub_f32_x(svptrue_b32(), v89, v179); + svfloat32_t v433 = svsub_f32_x(svptrue_b32(), v107, v161); + svfloat32_t v434 = svsub_f32_x(svptrue_b32(), v125, v143); + svfloat32_t v226 = svadd_f32_x(svptrue_b32(), v216, v217); + svfloat32_t v227 = svadd_f32_x(svptrue_b32(), v218, v220); + svfloat32_t v229 = svsub_f32_x(svptrue_b32(), v222, v223); + svfloat32_t v230 = svadd_f32_x(svptrue_b32(), v221, v225); + svfloat32_t v235 = svsub_f32_x(svptrue_b32(), v217, v219); + svfloat32_t v236 = svsub_f32_x(svptrue_b32(), v216, v219); + svfloat32_t v237 = svsub_f32_x(svptrue_b32(), v217, v216); + svfloat32_t v238 = svsub_f32_x(svptrue_b32(), v220, v219); + svfloat32_t v239 = svsub_f32_x(svptrue_b32(), v218, v219); + svfloat32_t v240 = svsub_f32_x(svptrue_b32(), v220, v218); + svfloat32_t v241 = svsub_f32_x(svptrue_b32(), v217, v220); + svfloat32_t v242 = svsub_f32_x(svptrue_b32(), v216, v218); + svfloat32_t v244 = svadd_f32_x(svptrue_b32(), v222, v224); + svfloat32_t v245 = svsub_f32_x(svptrue_b32(), v221, v224); + svfloat32_t v246 = svadd_f32_x(svptrue_b32(), v221, v222); + svfloat32_t v247 = svsub_f32_x(svptrue_b32(), v224, v225); + svfloat32_t v248 = svsub_f32_x(svptrue_b32(), v223, v224); + svfloat32_t v249 = svsub_f32_x(svptrue_b32(), v223, v225); + svfloat32_t v250 = svadd_f32_x(svptrue_b32(), v222, v225); + svfloat32_t v251 = svsub_f32_x(svptrue_b32(), v221, v223); + svfloat32_t v435 = svadd_f32_x(svptrue_b32(), v425, v426); + svfloat32_t v436 = svadd_f32_x(svptrue_b32(), v427, v429); + svfloat32_t v438 = svsub_f32_x(svptrue_b32(), v431, 
v432); + svfloat32_t v439 = svadd_f32_x(svptrue_b32(), v430, v434); + svfloat32_t v444 = svsub_f32_x(svptrue_b32(), v426, v428); + svfloat32_t v445 = svsub_f32_x(svptrue_b32(), v425, v428); + svfloat32_t v446 = svsub_f32_x(svptrue_b32(), v426, v425); + svfloat32_t v447 = svsub_f32_x(svptrue_b32(), v429, v428); + svfloat32_t v448 = svsub_f32_x(svptrue_b32(), v427, v428); + svfloat32_t v449 = svsub_f32_x(svptrue_b32(), v429, v427); + svfloat32_t v450 = svsub_f32_x(svptrue_b32(), v426, v429); + svfloat32_t v451 = svsub_f32_x(svptrue_b32(), v425, v427); + svfloat32_t v453 = svadd_f32_x(svptrue_b32(), v431, v433); + svfloat32_t v454 = svsub_f32_x(svptrue_b32(), v430, v433); + svfloat32_t v455 = svadd_f32_x(svptrue_b32(), v430, v431); + svfloat32_t v456 = svsub_f32_x(svptrue_b32(), v433, v434); + svfloat32_t v457 = svsub_f32_x(svptrue_b32(), v432, v433); + svfloat32_t v458 = svsub_f32_x(svptrue_b32(), v432, v434); + svfloat32_t v459 = svadd_f32_x(svptrue_b32(), v431, v434); + svfloat32_t v460 = svsub_f32_x(svptrue_b32(), v430, v432); + svfloat32_t v228 = svadd_f32_x(svptrue_b32(), v219, v226); + svfloat32_t v233 = svsub_f32_x(svptrue_b32(), v229, v230); + svfloat32_t v243 = svsub_f32_x(svptrue_b32(), v227, v226); + svfloat32_t v252 = svadd_f32_x(svptrue_b32(), v229, v230); + svfloat32_t v279 = svmul_f32_x(svptrue_b32(), v236, v1012); + svfloat32_t v284 = svmul_f32_x(svptrue_b32(), v237, v1013); + svfloat32_t v294 = svmul_f32_x(svptrue_b32(), v239, v1015); + svfloat32_t v299 = svmul_f32_x(svptrue_b32(), v240, v1016); + svfloat32_t zero321 = svdup_n_f32(0); + svfloat32_t v321 = svcmla_f32_x(pred_full, zero321, v1020, v244, 90); + svfloat32_t zero335 = svdup_n_f32(0); + svfloat32_t v335 = svcmla_f32_x(pred_full, zero335, v1022, v246, 90); + svfloat32_t zero342 = svdup_n_f32(0); + svfloat32_t v342 = svcmla_f32_x(pred_full, zero342, v1023, v247, 90); + svfloat32_t zero356 = svdup_n_f32(0); + svfloat32_t v356 = svcmla_f32_x(pred_full, zero356, v1025, v249, 90); + svfloat32_t zero363 = svdup_n_f32(0); + svfloat32_t v363 = svcmla_f32_x(pred_full, zero363, v1026, v250, 90); + svfloat32_t v437 = svadd_f32_x(svptrue_b32(), v428, v435); + svfloat32_t v442 = svsub_f32_x(svptrue_b32(), v438, v439); + svfloat32_t v452 = svsub_f32_x(svptrue_b32(), v436, v435); + svfloat32_t v461 = svadd_f32_x(svptrue_b32(), v438, v439); + svfloat32_t v488 = svmul_f32_x(svptrue_b32(), v445, v1012); + svfloat32_t v493 = svmul_f32_x(svptrue_b32(), v446, v1013); + svfloat32_t v503 = svmul_f32_x(svptrue_b32(), v448, v1015); + svfloat32_t v508 = svmul_f32_x(svptrue_b32(), v449, v1016); + svfloat32_t zero530 = svdup_n_f32(0); + svfloat32_t v530 = svcmla_f32_x(pred_full, zero530, v1020, v453, 90); + svfloat32_t zero544 = svdup_n_f32(0); + svfloat32_t v544 = svcmla_f32_x(pred_full, zero544, v1022, v455, 90); + svfloat32_t zero551 = svdup_n_f32(0); + svfloat32_t v551 = svcmla_f32_x(pred_full, zero551, v1023, v456, 90); + svfloat32_t zero565 = svdup_n_f32(0); + svfloat32_t v565 = svcmla_f32_x(pred_full, zero565, v1025, v458, 90); + svfloat32_t zero572 = svdup_n_f32(0); + svfloat32_t v572 = svcmla_f32_x(pred_full, zero572, v1026, v459, 90); + svfloat32_t v231 = svadd_f32_x(svptrue_b32(), v228, v227); + svfloat32_t v234 = svsub_f32_x(svptrue_b32(), v233, v224); + svfloat32_t v314 = svmul_f32_x(svptrue_b32(), v243, v1019); + svfloat32_t zero377 = svdup_n_f32(0); + svfloat32_t v377 = svcmla_f32_x(pred_full, zero377, v1028, v252, 90); + svfloat32_t v379 = svmla_f32_x(pred_full, v279, v235, v1011); + svfloat32_t v380 = svmla_f32_x(pred_full, 
v284, v236, v1012); + svfloat32_t v381 = svnmls_f32_x(pred_full, v284, v235, v1011); + svfloat32_t v382 = svmla_f32_x(pred_full, v294, v238, v1014); + svfloat32_t v383 = svmla_f32_x(pred_full, v299, v239, v1015); + svfloat32_t v384 = svnmls_f32_x(pred_full, v299, v238, v1014); + svfloat32_t v387 = svcmla_f32_x(pred_full, v335, v1021, v245, 90); + svfloat32_t v388 = svsub_f32_x(svptrue_b32(), v321, v335); + svfloat32_t v389 = svcmla_f32_x(pred_full, v356, v1024, v248, 90); + svfloat32_t v390 = svsub_f32_x(svptrue_b32(), v342, v356); + svfloat32_t v440 = svadd_f32_x(svptrue_b32(), v437, v436); + svfloat32_t v443 = svsub_f32_x(svptrue_b32(), v442, v433); + svfloat32_t v523 = svmul_f32_x(svptrue_b32(), v452, v1019); + svfloat32_t zero586 = svdup_n_f32(0); + svfloat32_t v586 = svcmla_f32_x(pred_full, zero586, v1028, v461, 90); + svfloat32_t v588 = svmla_f32_x(pred_full, v488, v444, v1011); + svfloat32_t v589 = svmla_f32_x(pred_full, v493, v445, v1012); + svfloat32_t v590 = svnmls_f32_x(pred_full, v493, v444, v1011); + svfloat32_t v591 = svmla_f32_x(pred_full, v503, v447, v1014); + svfloat32_t v592 = svmla_f32_x(pred_full, v508, v448, v1015); + svfloat32_t v593 = svnmls_f32_x(pred_full, v508, v447, v1014); + svfloat32_t v596 = svcmla_f32_x(pred_full, v544, v1021, v454, 90); + svfloat32_t v597 = svsub_f32_x(svptrue_b32(), v530, v544); + svfloat32_t v598 = svcmla_f32_x(pred_full, v565, v1024, v457, 90); + svfloat32_t v599 = svsub_f32_x(svptrue_b32(), v551, v565); + svfloat32_t v232 = svadd_f32_x(svptrue_b32(), v34, v231); + svfloat32_t zero269 = svdup_n_f32(0); + svfloat32_t v269 = svcmla_f32_x(pred_full, zero269, v1010, v234, 90); + svfloat32_t v385 = svmla_f32_x(pred_full, v314, v242, v1018); + svfloat32_t v386 = svmla_f32_x(pred_full, v314, v241, v1017); + svfloat32_t v391 = svcmla_f32_x(pred_full, v377, v1027, v251, 90); + svfloat32_t v392 = svsub_f32_x(svptrue_b32(), v363, v377); + svfloat32_t v411 = svadd_f32_x(svptrue_b32(), v387, v388); + svfloat32_t v441 = svadd_f32_x(svptrue_b32(), v35, v440); + svfloat32_t zero478 = svdup_n_f32(0); + svfloat32_t v478 = svcmla_f32_x(pred_full, zero478, v1010, v443, 90); + svfloat32_t v594 = svmla_f32_x(pred_full, v523, v451, v1018); + svfloat32_t v595 = svmla_f32_x(pred_full, v523, v450, v1017); + svfloat32_t v600 = svcmla_f32_x(pred_full, v586, v1027, v460, 90); + svfloat32_t v601 = svsub_f32_x(svptrue_b32(), v572, v586); + svfloat32_t v620 = svadd_f32_x(svptrue_b32(), v596, v597); + svfloat32_t v378 = svmls_f32_x(pred_full, v232, v231, v1009); + svfloat32_t v393 = svadd_f32_x(svptrue_b32(), v383, v385); + svfloat32_t v403 = svadd_f32_x(svptrue_b32(), v269, v389); + svfloat32_t v405 = svsub_f32_x(svptrue_b32(), v391, v387); + svfloat32_t v407 = svadd_f32_x(svptrue_b32(), v269, v392); + svfloat32_t v409 = svsub_f32_x(svptrue_b32(), v392, v388); + svfloat32_t v412 = svadd_f32_x(svptrue_b32(), v411, v389); + svfloat32_t v587 = svmls_f32_x(pred_full, v441, v440, v1009); + svfloat32_t v602 = svadd_f32_x(svptrue_b32(), v592, v594); + svfloat32_t v612 = svadd_f32_x(svptrue_b32(), v478, v598); + svfloat32_t v614 = svsub_f32_x(svptrue_b32(), v600, v596); + svfloat32_t v616 = svadd_f32_x(svptrue_b32(), v478, v601); + svfloat32_t v618 = svsub_f32_x(svptrue_b32(), v601, v597); + svfloat32_t v621 = svadd_f32_x(svptrue_b32(), v620, v598); + svst1_f64(pred_full, (double *)(v1036), svreinterpret_f64_f32(v232)); + svst1_f64(pred_full, (double *)(v1045), svreinterpret_f64_f32(v441)); + svfloat32_t v394 = svadd_f32_x(svptrue_b32(), v393, v378); + svfloat32_t v395 = 
svsub_f32_x(svptrue_b32(), v378, v380); + svfloat32_t v397 = svadd_f32_x(svptrue_b32(), v378, v384); + svfloat32_t v399 = svsub_f32_x(svptrue_b32(), v378, v381); + svfloat32_t v401 = svadd_f32_x(svptrue_b32(), v378, v379); + svfloat32_t v404 = svadd_f32_x(svptrue_b32(), v403, v391); + svfloat32_t v406 = svsub_f32_x(svptrue_b32(), v405, v269); + svfloat32_t v408 = svadd_f32_x(svptrue_b32(), v407, v390); + svfloat32_t v410 = svsub_f32_x(svptrue_b32(), v409, v269); + svfloat32_t v413 = svadd_f32_x(svptrue_b32(), v412, v390); + svfloat32_t v603 = svadd_f32_x(svptrue_b32(), v602, v587); + svfloat32_t v604 = svsub_f32_x(svptrue_b32(), v587, v589); + svfloat32_t v606 = svadd_f32_x(svptrue_b32(), v587, v593); + svfloat32_t v608 = svsub_f32_x(svptrue_b32(), v587, v590); + svfloat32_t v610 = svadd_f32_x(svptrue_b32(), v587, v588); + svfloat32_t v613 = svadd_f32_x(svptrue_b32(), v612, v600); + svfloat32_t v615 = svsub_f32_x(svptrue_b32(), v614, v478); + svfloat32_t v617 = svadd_f32_x(svptrue_b32(), v616, v599); + svfloat32_t v619 = svsub_f32_x(svptrue_b32(), v618, v478); + svfloat32_t v622 = svadd_f32_x(svptrue_b32(), v621, v599); + svfloat32_t v396 = svsub_f32_x(svptrue_b32(), v395, v385); + svfloat32_t v398 = svadd_f32_x(svptrue_b32(), v397, v386); + svfloat32_t v400 = svsub_f32_x(svptrue_b32(), v399, v386); + svfloat32_t v402 = svsub_f32_x(svptrue_b32(), v401, v382); + svfloat32_t v414 = svsub_f32_x(svptrue_b32(), v413, v269); + svfloat32_t v416 = svadd_f32_x(svptrue_b32(), v394, v404); + svfloat32_t v423 = svsub_f32_x(svptrue_b32(), v394, v404); + svfloat32_t v605 = svsub_f32_x(svptrue_b32(), v604, v594); + svfloat32_t v607 = svadd_f32_x(svptrue_b32(), v606, v595); + svfloat32_t v609 = svsub_f32_x(svptrue_b32(), v608, v595); + svfloat32_t v611 = svsub_f32_x(svptrue_b32(), v610, v591); + svfloat32_t v623 = svsub_f32_x(svptrue_b32(), v622, v478); + svfloat32_t v625 = svadd_f32_x(svptrue_b32(), v603, v613); + svfloat32_t v632 = svsub_f32_x(svptrue_b32(), v603, v613); + svfloat32_t v415 = svadd_f32_x(svptrue_b32(), v402, v414); + svfloat32_t v417 = svadd_f32_x(svptrue_b32(), v396, v406); + svfloat32_t v418 = svsub_f32_x(svptrue_b32(), v398, v408); + svfloat32_t v419 = svadd_f32_x(svptrue_b32(), v400, v410); + svfloat32_t v420 = svsub_f32_x(svptrue_b32(), v400, v410); + svfloat32_t v421 = svadd_f32_x(svptrue_b32(), v398, v408); + svfloat32_t v422 = svsub_f32_x(svptrue_b32(), v396, v406); + svfloat32_t v424 = svsub_f32_x(svptrue_b32(), v402, v414); + svfloat32_t v624 = svadd_f32_x(svptrue_b32(), v611, v623); + svfloat32_t v626 = svadd_f32_x(svptrue_b32(), v605, v615); + svfloat32_t v627 = svsub_f32_x(svptrue_b32(), v607, v617); + svfloat32_t v628 = svadd_f32_x(svptrue_b32(), v609, v619); + svfloat32_t v629 = svsub_f32_x(svptrue_b32(), v609, v619); + svfloat32_t v630 = svadd_f32_x(svptrue_b32(), v607, v617); + svfloat32_t v631 = svsub_f32_x(svptrue_b32(), v605, v615); + svfloat32_t v633 = svsub_f32_x(svptrue_b32(), v611, v623); + svst1_f64(pred_full, (double *)(v1072), svreinterpret_f64_f32(v423)); + svst1_f64(pred_full, (double *)(v1081), svreinterpret_f64_f32(v632)); + svst1_f64(pred_full, (double *)(v1198), svreinterpret_f64_f32(v416)); + svst1_f64(pred_full, (double *)(v1207), svreinterpret_f64_f32(v625)); + svst1_f64(pred_full, (double *)(v1054), svreinterpret_f64_f32(v424)); + svst1_f64(pred_full, (double *)(v1063), svreinterpret_f64_f32(v633)); + svst1_f64(pred_full, (double *)(v1090), svreinterpret_f64_f32(v422)); + svst1_f64(pred_full, (double *)(v1099), svreinterpret_f64_f32(v631)); + 
svst1_f64(pred_full, (double *)(v1108), svreinterpret_f64_f32(v421)); + svst1_f64(pred_full, (double *)(v1117), svreinterpret_f64_f32(v630)); + svst1_f64(pred_full, (double *)(v1126), svreinterpret_f64_f32(v420)); + svst1_f64(pred_full, (double *)(v1135), svreinterpret_f64_f32(v629)); + svst1_f64(pred_full, (double *)(v1144), svreinterpret_f64_f32(v419)); + svst1_f64(pred_full, (double *)(v1153), svreinterpret_f64_f32(v628)); + svst1_f64(pred_full, (double *)(v1162), svreinterpret_f64_f32(v418)); + svst1_f64(pred_full, (double *)(v1171), svreinterpret_f64_f32(v627)); + svst1_f64(pred_full, (double *)(v1180), svreinterpret_f64_f32(v417)); + svst1_f64(pred_full, (double *)(v1189), svreinterpret_f64_f32(v626)); + svst1_f64(pred_full, (double *)(v1216), svreinterpret_f64_f32(v415)); + svst1_f64(pred_full, (double *)(v1225), svreinterpret_f64_f32(v624)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cf32_ac_n_gu24(const armral_cmplx_int16_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + int idist, float dir) { + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + for (int j = 0; j < howmany; j += 1) { + int16x4_t v89 = vld1s_s16(&v5[istride]); + float v223 = 1.0000000000000000e+00F; + float v224 = -1.0000000000000000e+00F; + float v231 = -7.0710678118654746e-01F; + float v238 = 7.0710678118654757e-01F; + float v290 = -1.4999999999999998e+00F; + float v291 = 1.4999999999999998e+00F; + float v298 = 1.0606601717798210e+00F; + float v305 = -1.0606601717798212e+00F; + float v359 = 8.6602540378443871e-01F; + float v367 = -8.6602540378443871e-01F; + float v374 = 6.1237243569579458e-01F; + float v375 = -6.1237243569579458e-01F; + float32x2_t v377 = (float32x2_t){v4, v4}; + int16x4_t v34 = vld1s_s16(&v5[0]); + float32x2_t v90 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v89)), 15); + float32x2_t v225 = (float32x2_t){v223, v224}; + float32x2_t v232 = (float32x2_t){v238, v231}; + float32x2_t v239 = (float32x2_t){v238, v238}; + float32x2_t v288 = (float32x2_t){v290, v290}; + float32x2_t v292 = (float32x2_t){v290, v291}; + float32x2_t v299 = (float32x2_t){v305, v298}; + float32x2_t v306 = (float32x2_t){v305, v305}; + float32x2_t v361 = (float32x2_t){v359, v367}; + float32x2_t v368 = (float32x2_t){v367, v367}; + float32x2_t v372 = (float32x2_t){v375, v375}; + float32x2_t v376 = (float32x2_t){v374, v375}; + int16x4_t v20 = vld1s_s16(&v5[istride * 8]); + int16x4_t v26 = vld1s_s16(&v5[istride * 16]); + float32x2_t v35 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v34)), 15); + int16x4_t v41 = vld1s_s16(&v5[istride * 11]); + int16x4_t v47 = vld1s_s16(&v5[istride * 19]); + int16x4_t v55 = vld1s_s16(&v5[istride * 3]); + int16x4_t v62 = vld1s_s16(&v5[istride * 14]); + int16x4_t v68 = vld1s_s16(&v5[istride * 22]); + int16x4_t v76 = vld1s_s16(&v5[istride * 6]); + int16x4_t v83 = vld1s_s16(&v5[istride * 17]); + int16x4_t v97 = vld1s_s16(&v5[istride * 9]); + int16x4_t v104 = vld1s_s16(&v5[istride * 20]); + int16x4_t v110 = vld1s_s16(&v5[istride * 4]); + int16x4_t v118 = vld1s_s16(&v5[istride * 12]); + int16x4_t v125 = vld1s_s16(&v5[istride * 23]); + int16x4_t v131 = vld1s_s16(&v5[istride * 7]); + int16x4_t v139 = vld1s_s16(&v5[istride * 15]); + int16x4_t v146 = vld1s_s16(&v5[istride * 2]); + int16x4_t v152 = vld1s_s16(&v5[istride * 10]); + int16x4_t v160 = vld1s_s16(&v5[istride * 18]); + int16x4_t v167 = vld1s_s16(&v5[istride * 5]); + int16x4_t v173 = vld1s_s16(&v5[istride * 13]); + int16x4_t 
v181 = vld1s_s16(&v5[istride * 21]); + float32x2_t v227 = vmul_f32(v377, v225); + float32x2_t v234 = vmul_f32(v377, v232); + float32x2_t v294 = vmul_f32(v377, v292); + float32x2_t v301 = vmul_f32(v377, v299); + float32x2_t v363 = vmul_f32(v377, v361); + float32x2_t v378 = vmul_f32(v377, v376); + float32x2_t v21 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v20)), 15); + float32x2_t v27 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v26)), 15); + float32x2_t v42 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v41)), 15); + float32x2_t v48 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v47)), 15); + float32x2_t v56 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v55)), 15); + float32x2_t v63 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v62)), 15); + float32x2_t v69 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v68)), 15); + float32x2_t v77 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v76)), 15); + float32x2_t v84 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v83)), 15); + float32x2_t v98 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v97)), 15); + float32x2_t v105 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v104)), 15); + float32x2_t v111 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v110)), 15); + float32x2_t v119 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v118)), 15); + float32x2_t v126 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v125)), 15); + float32x2_t v132 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v131)), 15); + float32x2_t v140 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v139)), 15); + float32x2_t v147 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v146)), 15); + float32x2_t v153 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v152)), 15); + float32x2_t v161 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v160)), 15); + float32x2_t v168 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v167)), 15); + float32x2_t v174 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v173)), 15); + float32x2_t v182 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v181)), 15); + float32x2_t v28 = vadd_f32(v21, v27); + float32x2_t v29 = vsub_f32(v21, v27); + float32x2_t v49 = vadd_f32(v42, v48); + float32x2_t v50 = vsub_f32(v42, v48); + float32x2_t v70 = vadd_f32(v63, v69); + float32x2_t v71 = vsub_f32(v63, v69); + float32x2_t v91 = vadd_f32(v84, v90); + float32x2_t v92 = vsub_f32(v84, v90); + float32x2_t v112 = vadd_f32(v105, v111); + float32x2_t v113 = vsub_f32(v105, v111); + float32x2_t v133 = vadd_f32(v126, v132); + float32x2_t v134 = vsub_f32(v126, v132); + float32x2_t v154 = vadd_f32(v147, v153); + float32x2_t v155 = vsub_f32(v147, v153); + float32x2_t v175 = vadd_f32(v168, v174); + float32x2_t v176 = vsub_f32(v168, v174); + float32x2_t v36 = vadd_f32(v28, v35); + float32x2_t v57 = vadd_f32(v49, v56); + float32x2_t v78 = vadd_f32(v70, v77); + float32x2_t v99 = vadd_f32(v91, v98); + float32x2_t v120 = vadd_f32(v112, v119); + float32x2_t v141 = vadd_f32(v133, v140); + float32x2_t v162 = vadd_f32(v154, v161); + float32x2_t v183 = vadd_f32(v175, v182); + float32x2_t v251 = vadd_f32(v28, v112); + float32x2_t v252 = vsub_f32(v28, v112); + float32x2_t v253 = vadd_f32(v70, v154); + float32x2_t v254 = vsub_f32(v70, v154); + float32x2_t v255 = vadd_f32(v49, v133); + float32x2_t v256 = vsub_f32(v49, v133); + float32x2_t v257 = vadd_f32(v91, v175); + float32x2_t v258 = vsub_f32(v91, v175); + float32x2_t v318 = vadd_f32(v29, v113); + float32x2_t v319 = vsub_f32(v29, v113); + float32x2_t v320 = vadd_f32(v71, v155); + float32x2_t v321 = vsub_f32(v71, v155); + float32x2_t v322 = vadd_f32(v50, v134); + float32x2_t v323 = vsub_f32(v50, v134); + float32x2_t v324 = vadd_f32(v92, v176); + float32x2_t v325 = vsub_f32(v92, v176); + float32x2_t v184 = 
vadd_f32(v36, v120); + float32x2_t v185 = vsub_f32(v36, v120); + float32x2_t v186 = vadd_f32(v78, v162); + float32x2_t v187 = vsub_f32(v78, v162); + float32x2_t v188 = vadd_f32(v57, v141); + float32x2_t v189 = vsub_f32(v57, v141); + float32x2_t v190 = vadd_f32(v99, v183); + float32x2_t v191 = vsub_f32(v99, v183); + float32x2_t v259 = vadd_f32(v251, v253); + float32x2_t v260 = vsub_f32(v251, v253); + float32x2_t v261 = vadd_f32(v255, v257); + float32x2_t v262 = vsub_f32(v255, v257); + float32x2_t v265 = vadd_f32(v256, v258); + float32x2_t v266 = vsub_f32(v256, v258); + float32x2_t v289 = vmul_f32(v252, v288); + float32x2_t v295 = vrev64_f32(v254); + float32x2_t v326 = vadd_f32(v318, v320); + float32x2_t v327 = vsub_f32(v318, v320); + float32x2_t v328 = vadd_f32(v322, v324); + float32x2_t v329 = vsub_f32(v322, v324); + float32x2_t v332 = vadd_f32(v323, v325); + float32x2_t v333 = vsub_f32(v323, v325); + float32x2_t v364 = vrev64_f32(v319); + float32x2_t v369 = vmul_f32(v321, v368); + float32x2_t v192 = vadd_f32(v184, v186); + float32x2_t v193 = vsub_f32(v184, v186); + float32x2_t v194 = vadd_f32(v188, v190); + float32x2_t v195 = vsub_f32(v188, v190); + float32x2_t v198 = vadd_f32(v189, v191); + float32x2_t v199 = vsub_f32(v189, v191); + float32x2_t v228 = vrev64_f32(v187); + float32x2_t v263 = vadd_f32(v259, v261); + float32x2_t v264 = vsub_f32(v259, v261); + float32x2_t v278 = vmul_f32(v260, v288); + float32x2_t v284 = vrev64_f32(v262); + float32x2_t v296 = vmul_f32(v295, v294); + float32x2_t v302 = vrev64_f32(v265); + float32x2_t v307 = vmul_f32(v266, v306); + float32x2_t v330 = vadd_f32(v326, v328); + float32x2_t v331 = vsub_f32(v326, v328); + float32x2_t v353 = vrev64_f32(v327); + float32x2_t v358 = vmul_f32(v329, v368); + float32x2_t v365 = vmul_f32(v364, v363); + float32x2_t v373 = vmul_f32(v332, v372); + float32x2_t v379 = vrev64_f32(v333); + float32x2_t v196 = vadd_f32(v192, v194); + float32x2_t v197 = vsub_f32(v192, v194); + float32x2_t v217 = vrev64_f32(v195); + float32x2_t v229 = vmul_f32(v228, v227); + float32x2_t v235 = vrev64_f32(v198); + float32x2_t v240 = vmul_f32(v199, v239); + float32x2_t v270 = vmul_f32(v263, v288); + float32x2_t v274 = vmul_f32(v264, v288); + float32x2_t v285 = vmul_f32(v284, v294); + float32x2_t v303 = vmul_f32(v302, v301); + float32x2_t v310 = vadd_f32(v289, v307); + float32x2_t v311 = vsub_f32(v289, v307); + float32x2_t v339 = vrev64_f32(v330); + float32x2_t v346 = vrev64_f32(v331); + float32x2_t v354 = vmul_f32(v353, v363); + float32x2_t v380 = vmul_f32(v379, v378); + float32x2_t v385 = vadd_f32(v369, v373); + float32x2_t v386 = vsub_f32(v369, v373); + float32x2_t v218 = vmul_f32(v217, v227); + float32x2_t v236 = vmul_f32(v235, v234); + float32x2_t v243 = vadd_f32(v185, v240); + float32x2_t v244 = vsub_f32(v185, v240); + float32x2_t v308 = vadd_f32(v278, v285); + float32x2_t v309 = vsub_f32(v278, v285); + float32x2_t v312 = vadd_f32(v296, v303); + float32x2_t v313 = vsub_f32(v296, v303); + float32x2_t v340 = vmul_f32(v339, v363); + float32x2_t v347 = vmul_f32(v346, v363); + float32x2_t v381 = vadd_f32(v354, v358); + float32x2_t v382 = vsub_f32(v354, v358); + float32x2_t v383 = vadd_f32(v365, v380); + float32x2_t v384 = vsub_f32(v365, v380); + float32x2_t v391 = vadd_f32(v196, v270); + v6[0] = v196; + float32x2_t v463 = vadd_f32(v197, v274); + v6[ostride * 12] = v197; + float32x2_t v241 = vadd_f32(v193, v218); + float32x2_t v242 = vsub_f32(v193, v218); + float32x2_t v245 = vadd_f32(v229, v236); + float32x2_t v246 = vsub_f32(v229, v236); + float32x2_t 
v314 = vadd_f32(v310, v312); + float32x2_t v315 = vsub_f32(v310, v312); + float32x2_t v316 = vadd_f32(v311, v313); + float32x2_t v317 = vsub_f32(v311, v313); + float32x2_t v387 = vadd_f32(v383, v385); + float32x2_t v388 = vsub_f32(v383, v385); + float32x2_t v389 = vadd_f32(v384, v386); + float32x2_t v390 = vsub_f32(v384, v386); + float32x2_t v392 = vadd_f32(v391, v340); + float32x2_t v393 = vsub_f32(v391, v340); + float32x2_t v464 = vadd_f32(v463, v347); + float32x2_t v465 = vsub_f32(v463, v347); + float32x2_t v247 = vadd_f32(v243, v245); + float32x2_t v248 = vsub_f32(v243, v245); + float32x2_t v249 = vadd_f32(v244, v246); + float32x2_t v250 = vsub_f32(v244, v246); + v6[ostride * 16] = v393; + v6[ostride * 8] = v392; + float32x2_t v427 = vadd_f32(v242, v309); + v6[ostride * 18] = v242; + v6[ostride * 4] = v465; + v6[ostride * 20] = v464; + float32x2_t v499 = vadd_f32(v241, v308); + v6[ostride * 6] = v241; + float32x2_t v409 = vadd_f32(v248, v315); + v6[ostride * 9] = v248; + float32x2_t v428 = vadd_f32(v427, v382); + float32x2_t v429 = vsub_f32(v427, v382); + float32x2_t v445 = vadd_f32(v249, v316); + v6[ostride * 3] = v249; + float32x2_t v481 = vadd_f32(v250, v317); + v6[ostride * 21] = v250; + float32x2_t v500 = vadd_f32(v499, v381); + float32x2_t v501 = vsub_f32(v499, v381); + float32x2_t v517 = vadd_f32(v247, v314); + v6[ostride * 15] = v247; + float32x2_t v410 = vadd_f32(v409, v388); + float32x2_t v411 = vsub_f32(v409, v388); + v6[ostride * 10] = v429; + v6[ostride * 2] = v428; + float32x2_t v446 = vadd_f32(v445, v389); + float32x2_t v447 = vsub_f32(v445, v389); + float32x2_t v482 = vadd_f32(v481, v390); + float32x2_t v483 = vsub_f32(v481, v390); + v6[ostride * 22] = v501; + v6[ostride * 14] = v500; + float32x2_t v518 = vadd_f32(v517, v387); + float32x2_t v519 = vsub_f32(v517, v387); + v6[ostride] = v411; + v6[ostride * 17] = v410; + v6[ostride * 19] = v447; + v6[ostride * 11] = v446; + v6[ostride * 13] = v483; + v6[ostride * 5] = v482; + v6[ostride * 7] = v519; + v6[ostride * 23] = v518; + v5 += 1 * idist; + v6 += 1 * 1; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cf32_ac_n_gu24(const armral_cmplx_int16_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + int idist, float dir) { + int64_t v0 = istride; + int64_t v1 = idist; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * v1; + int64_t v12 = v10 * 1; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + float v278 = -1.0000000000000000e+00F; + float v285 = -7.0710678118654746e-01F; + float v292 = 7.0710678118654757e-01F; + float v345 = -1.4999999999999998e+00F; + float v350 = 1.4999999999999998e+00F; + float v357 = 1.0606601717798210e+00F; + float v364 = -1.0606601717798212e+00F; + float v428 = -8.6602540378443871e-01F; + float v438 = -6.1237243569579458e-01F; + const int32_t *v743 = &v5[v0]; + float32x2_t *v930 = &v6[v2]; + int64_t v19 = v0 * 8; + int64_t v27 = v0 * 16; + int64_t v46 = v0 * 11; + int64_t v54 = v0 * 19; + int64_t v64 = v0 * 3; + int64_t v73 = v0 * 14; + int64_t v81 = v0 * 22; + int64_t v91 = v0 * 6; + int64_t v100 = v0 * 17; + int64_t v118 = v0 * 9; + int64_t v127 = v0 * 20; + int64_t v135 = v0 * 4; + int64_t v145 = v0 * 12; + int64_t v154 = v0 * 23; + int64_t v162 = v0 * 7; + int64_t v172 = v0 * 15; + int64_t v181 = v0 * 2; + int64_t v189 = v0 * 10; + int64_t v199 = v0 
* 18; + int64_t v208 = v0 * 5; + int64_t v216 = v0 * 13; + int64_t v226 = v0 * 21; + float v281 = v4 * v278; + float v288 = v4 * v285; + float v353 = v4 * v350; + float v360 = v4 * v357; + float v424 = v4 * v428; + float v441 = v4 * v438; + int64_t v465 = v2 * 16; + int64_t v472 = v2 * 8; + int64_t v482 = v2 * 9; + int64_t v496 = v2 * 17; + int64_t v506 = v2 * 18; + int64_t v513 = v2 * 10; + int64_t v520 = v2 * 2; + int64_t v530 = v2 * 3; + int64_t v537 = v2 * 19; + int64_t v544 = v2 * 11; + int64_t v554 = v2 * 12; + int64_t v561 = v2 * 4; + int64_t v568 = v2 * 20; + int64_t v578 = v2 * 21; + int64_t v585 = v2 * 13; + int64_t v592 = v2 * 5; + int64_t v602 = v2 * 6; + int64_t v609 = v2 * 22; + int64_t v616 = v2 * 14; + int64_t v626 = v2 * 15; + int64_t v633 = v2 * 7; + int64_t v640 = v2 * 23; + const int32_t *v671 = &v5[0]; + svint64_t v861 = svindex_s64(0, v1); + svfloat32_t v870 = svdup_n_f32(v292); + svfloat32_t v875 = svdup_n_f32(v345); + svfloat32_t v878 = svdup_n_f32(v364); + svfloat32_t v884 = svdup_n_f32(v428); + svfloat32_t v885 = svdup_n_f32(v438); + float32x2_t *v894 = &v6[0]; + const int32_t *v652 = &v5[v19]; + const int32_t *v661 = &v5[v27]; + svint16_t v673 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v671), v861)); + const int32_t *v680 = &v5[v46]; + const int32_t *v689 = &v5[v54]; + const int32_t *v698 = &v5[v64]; + const int32_t *v707 = &v5[v73]; + const int32_t *v716 = &v5[v81]; + const int32_t *v725 = &v5[v91]; + const int32_t *v734 = &v5[v100]; + svint16_t v745 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v743), v861)); + const int32_t *v752 = &v5[v118]; + const int32_t *v761 = &v5[v127]; + const int32_t *v770 = &v5[v135]; + const int32_t *v779 = &v5[v145]; + const int32_t *v788 = &v5[v154]; + const int32_t *v797 = &v5[v162]; + const int32_t *v806 = &v5[v172]; + const int32_t *v815 = &v5[v181]; + const int32_t *v824 = &v5[v189]; + const int32_t *v833 = &v5[v199]; + const int32_t *v842 = &v5[v208]; + const int32_t *v851 = &v5[v216]; + const int32_t *v860 = &v5[v226]; + svfloat32_t v868 = svdup_n_f32(v281); + svfloat32_t v869 = svdup_n_f32(v288); + svfloat32_t v876 = svdup_n_f32(v353); + svfloat32_t v877 = svdup_n_f32(v360); + svfloat32_t v883 = svdup_n_f32(v424); + svfloat32_t v886 = svdup_n_f32(v441); + float32x2_t *v903 = &v6[v465]; + float32x2_t *v912 = &v6[v472]; + float32x2_t *v921 = &v6[v482]; + float32x2_t *v939 = &v6[v496]; + float32x2_t *v948 = &v6[v506]; + float32x2_t *v957 = &v6[v513]; + float32x2_t *v966 = &v6[v520]; + float32x2_t *v975 = &v6[v530]; + float32x2_t *v984 = &v6[v537]; + float32x2_t *v993 = &v6[v544]; + float32x2_t *v1002 = &v6[v554]; + float32x2_t *v1011 = &v6[v561]; + float32x2_t *v1020 = &v6[v568]; + float32x2_t *v1029 = &v6[v578]; + float32x2_t *v1038 = &v6[v585]; + float32x2_t *v1047 = &v6[v592]; + float32x2_t *v1056 = &v6[v602]; + float32x2_t *v1065 = &v6[v609]; + float32x2_t *v1074 = &v6[v616]; + float32x2_t *v1083 = &v6[v626]; + float32x2_t *v1092 = &v6[v633]; + float32x2_t *v1101 = &v6[v640]; + svfloat32_t v43 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v673, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v114 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v745, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + 
svint16_t v654 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v652), v861)); + svint16_t v663 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v661), v861)); + svint16_t v682 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v680), v861)); + svint16_t v691 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v689), v861)); + svint16_t v700 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v698), v861)); + svint16_t v709 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v707), v861)); + svint16_t v718 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v716), v861)); + svint16_t v727 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v725), v861)); + svint16_t v736 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v734), v861)); + svint16_t v754 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v752), v861)); + svint16_t v763 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v761), v861)); + svint16_t v772 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v770), v861)); + svint16_t v781 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v779), v861)); + svint16_t v790 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v788), v861)); + svint16_t v799 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v797), v861)); + svint16_t v808 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v806), v861)); + svint16_t v817 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v815), v861)); + svint16_t v826 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v824), v861)); + svint16_t v835 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v833), v861)); + svint16_t v844 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v842), v861)); + svint16_t v853 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v851), v861)); + svint16_t v862 = svreinterpret_s16_u64( + svld1uw_gather_s64index_u64(pred_full, (const unsigned *)(v860), v861)); + svfloat32_t v25 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v654, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v33 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v663, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v52 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v682, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v60 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v691, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v70 = svmul_n_f32_x( + 
pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v700, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v79 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v709, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v87 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v718, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v97 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v727, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v106 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v736, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v124 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v754, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v133 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v763, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v141 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v772, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v151 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v781, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v160 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v790, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v168 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v799, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v178 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v808, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v187 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v817, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v195 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v826, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v205 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v835, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v214 = svmul_n_f32_x( + pred_full, + 
svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v844, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v222 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v853, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v232 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v862, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v34 = svadd_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v35 = svsub_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v61 = svadd_f32_x(svptrue_b32(), v52, v60); + svfloat32_t v62 = svsub_f32_x(svptrue_b32(), v52, v60); + svfloat32_t v88 = svadd_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v89 = svsub_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v115 = svadd_f32_x(svptrue_b32(), v106, v114); + svfloat32_t v116 = svsub_f32_x(svptrue_b32(), v106, v114); + svfloat32_t v142 = svadd_f32_x(svptrue_b32(), v133, v141); + svfloat32_t v143 = svsub_f32_x(svptrue_b32(), v133, v141); + svfloat32_t v169 = svadd_f32_x(svptrue_b32(), v160, v168); + svfloat32_t v170 = svsub_f32_x(svptrue_b32(), v160, v168); + svfloat32_t v196 = svadd_f32_x(svptrue_b32(), v187, v195); + svfloat32_t v197 = svsub_f32_x(svptrue_b32(), v187, v195); + svfloat32_t v223 = svadd_f32_x(svptrue_b32(), v214, v222); + svfloat32_t v224 = svsub_f32_x(svptrue_b32(), v214, v222); + svfloat32_t v44 = svadd_f32_x(svptrue_b32(), v34, v43); + svfloat32_t v71 = svadd_f32_x(svptrue_b32(), v61, v70); + svfloat32_t v98 = svadd_f32_x(svptrue_b32(), v88, v97); + svfloat32_t v125 = svadd_f32_x(svptrue_b32(), v115, v124); + svfloat32_t v152 = svadd_f32_x(svptrue_b32(), v142, v151); + svfloat32_t v179 = svadd_f32_x(svptrue_b32(), v169, v178); + svfloat32_t v206 = svadd_f32_x(svptrue_b32(), v196, v205); + svfloat32_t v233 = svadd_f32_x(svptrue_b32(), v223, v232); + svfloat32_t v306 = svadd_f32_x(svptrue_b32(), v34, v142); + svfloat32_t v307 = svsub_f32_x(svptrue_b32(), v34, v142); + svfloat32_t v308 = svadd_f32_x(svptrue_b32(), v88, v196); + svfloat32_t v309 = svsub_f32_x(svptrue_b32(), v88, v196); + svfloat32_t v310 = svadd_f32_x(svptrue_b32(), v61, v169); + svfloat32_t v311 = svsub_f32_x(svptrue_b32(), v61, v169); + svfloat32_t v312 = svadd_f32_x(svptrue_b32(), v115, v223); + svfloat32_t v313 = svsub_f32_x(svptrue_b32(), v115, v223); + svfloat32_t v378 = svadd_f32_x(svptrue_b32(), v35, v143); + svfloat32_t v379 = svsub_f32_x(svptrue_b32(), v35, v143); + svfloat32_t v380 = svadd_f32_x(svptrue_b32(), v89, v197); + svfloat32_t v381 = svsub_f32_x(svptrue_b32(), v89, v197); + svfloat32_t v382 = svadd_f32_x(svptrue_b32(), v62, v170); + svfloat32_t v383 = svsub_f32_x(svptrue_b32(), v62, v170); + svfloat32_t v384 = svadd_f32_x(svptrue_b32(), v116, v224); + svfloat32_t v385 = svsub_f32_x(svptrue_b32(), v116, v224); + svfloat32_t v234 = svadd_f32_x(svptrue_b32(), v44, v152); + svfloat32_t v235 = svsub_f32_x(svptrue_b32(), v44, v152); + svfloat32_t v236 = svadd_f32_x(svptrue_b32(), v98, v206); + svfloat32_t v237 = svsub_f32_x(svptrue_b32(), v98, v206); + svfloat32_t v238 = svadd_f32_x(svptrue_b32(), v71, v179); + svfloat32_t v239 = svsub_f32_x(svptrue_b32(), v71, v179); + svfloat32_t v240 = svadd_f32_x(svptrue_b32(), v125, v233); + svfloat32_t v241 = svsub_f32_x(svptrue_b32(), v125, v233); + svfloat32_t 
v314 = svadd_f32_x(svptrue_b32(), v306, v308); + svfloat32_t v315 = svsub_f32_x(svptrue_b32(), v306, v308); + svfloat32_t v316 = svadd_f32_x(svptrue_b32(), v310, v312); + svfloat32_t v317 = svsub_f32_x(svptrue_b32(), v310, v312); + svfloat32_t v320 = svadd_f32_x(svptrue_b32(), v311, v313); + svfloat32_t v321 = svsub_f32_x(svptrue_b32(), v311, v313); + svfloat32_t zero355 = svdup_n_f32(0); + svfloat32_t v355 = svcmla_f32_x(pred_full, zero355, v876, v309, 90); + svfloat32_t v386 = svadd_f32_x(svptrue_b32(), v378, v380); + svfloat32_t v387 = svsub_f32_x(svptrue_b32(), v378, v380); + svfloat32_t v388 = svadd_f32_x(svptrue_b32(), v382, v384); + svfloat32_t v389 = svsub_f32_x(svptrue_b32(), v382, v384); + svfloat32_t v392 = svadd_f32_x(svptrue_b32(), v383, v385); + svfloat32_t v393 = svsub_f32_x(svptrue_b32(), v383, v385); + svfloat32_t zero426 = svdup_n_f32(0); + svfloat32_t v426 = svcmla_f32_x(pred_full, zero426, v883, v379, 90); + svfloat32_t v242 = svadd_f32_x(svptrue_b32(), v234, v236); + svfloat32_t v243 = svsub_f32_x(svptrue_b32(), v234, v236); + svfloat32_t v244 = svadd_f32_x(svptrue_b32(), v238, v240); + svfloat32_t v245 = svsub_f32_x(svptrue_b32(), v238, v240); + svfloat32_t v248 = svadd_f32_x(svptrue_b32(), v239, v241); + svfloat32_t v249 = svsub_f32_x(svptrue_b32(), v239, v241); + svfloat32_t zero283 = svdup_n_f32(0); + svfloat32_t v283 = svcmla_f32_x(pred_full, zero283, v868, v237, 90); + svfloat32_t v318 = svadd_f32_x(svptrue_b32(), v314, v316); + svfloat32_t v319 = svsub_f32_x(svptrue_b32(), v314, v316); + svfloat32_t zero343 = svdup_n_f32(0); + svfloat32_t v343 = svcmla_f32_x(pred_full, zero343, v876, v317, 90); + svfloat32_t zero362 = svdup_n_f32(0); + svfloat32_t v362 = svcmla_f32_x(pred_full, zero362, v877, v320, 90); + svfloat32_t v367 = svmul_f32_x(svptrue_b32(), v321, v878); + svfloat32_t v390 = svadd_f32_x(svptrue_b32(), v386, v388); + svfloat32_t v391 = svsub_f32_x(svptrue_b32(), v386, v388); + svfloat32_t zero414 = svdup_n_f32(0); + svfloat32_t v414 = svcmla_f32_x(pred_full, zero414, v883, v387, 90); + svfloat32_t v436 = svmul_f32_x(svptrue_b32(), v392, v885); + svfloat32_t zero443 = svdup_n_f32(0); + svfloat32_t v443 = svcmla_f32_x(pred_full, zero443, v886, v393, 90); + svfloat32_t v246 = svadd_f32_x(svptrue_b32(), v242, v244); + svfloat32_t v247 = svsub_f32_x(svptrue_b32(), v242, v244); + svfloat32_t zero271 = svdup_n_f32(0); + svfloat32_t v271 = svcmla_f32_x(pred_full, zero271, v868, v245, 90); + svfloat32_t zero290 = svdup_n_f32(0); + svfloat32_t v290 = svcmla_f32_x(pred_full, zero290, v869, v248, 90); + svfloat32_t v368 = svmla_f32_x(pred_full, v343, v315, v875); + svfloat32_t v369 = svnmls_f32_x(pred_full, v343, v315, v875); + svfloat32_t v370 = svmla_f32_x(pred_full, v367, v307, v875); + svfloat32_t v371 = svnmls_f32_x(pred_full, v367, v307, v875); + svfloat32_t v372 = svadd_f32_x(svptrue_b32(), v355, v362); + svfloat32_t v373 = svsub_f32_x(svptrue_b32(), v355, v362); + svfloat32_t zero400 = svdup_n_f32(0); + svfloat32_t v400 = svcmla_f32_x(pred_full, zero400, v883, v390, 90); + svfloat32_t zero407 = svdup_n_f32(0); + svfloat32_t v407 = svcmla_f32_x(pred_full, zero407, v883, v391, 90); + svfloat32_t v444 = svmla_f32_x(pred_full, v414, v389, v884); + svfloat32_t v445 = svmls_f32_x(pred_full, v414, v389, v884); + svfloat32_t v446 = svadd_f32_x(svptrue_b32(), v426, v443); + svfloat32_t v447 = svsub_f32_x(svptrue_b32(), v426, v443); + svfloat32_t v448 = svmla_f32_x(pred_full, v436, v381, v884); + svfloat32_t v449 = svnmls_f32_x(pred_full, v436, v381, v884); + 
svfloat32_t v296 = svadd_f32_x(svptrue_b32(), v243, v271); + svfloat32_t v297 = svsub_f32_x(svptrue_b32(), v243, v271); + svfloat32_t v298 = svmla_f32_x(pred_full, v235, v249, v870); + svfloat32_t v299 = svmls_f32_x(pred_full, v235, v249, v870); + svfloat32_t v300 = svadd_f32_x(svptrue_b32(), v283, v290); + svfloat32_t v301 = svsub_f32_x(svptrue_b32(), v283, v290); + svfloat32_t v374 = svadd_f32_x(svptrue_b32(), v370, v372); + svfloat32_t v375 = svsub_f32_x(svptrue_b32(), v370, v372); + svfloat32_t v376 = svadd_f32_x(svptrue_b32(), v371, v373); + svfloat32_t v377 = svsub_f32_x(svptrue_b32(), v371, v373); + svfloat32_t v450 = svadd_f32_x(svptrue_b32(), v446, v448); + svfloat32_t v451 = svsub_f32_x(svptrue_b32(), v446, v448); + svfloat32_t v452 = svadd_f32_x(svptrue_b32(), v447, v449); + svfloat32_t v453 = svsub_f32_x(svptrue_b32(), v447, v449); + svfloat32_t v454 = svmla_f32_x(pred_full, v246, v318, v875); + svfloat32_t v550 = svmla_f32_x(pred_full, v247, v319, v875); + svst1_f64(pred_full, (double *)(v894), svreinterpret_f64_f32(v246)); + svst1_f64(pred_full, (double *)(v1002), svreinterpret_f64_f32(v247)); + svfloat32_t v302 = svadd_f32_x(svptrue_b32(), v298, v300); + svfloat32_t v303 = svsub_f32_x(svptrue_b32(), v298, v300); + svfloat32_t v304 = svadd_f32_x(svptrue_b32(), v299, v301); + svfloat32_t v305 = svsub_f32_x(svptrue_b32(), v299, v301); + svfloat32_t v455 = svadd_f32_x(svptrue_b32(), v454, v400); + svfloat32_t v456 = svsub_f32_x(svptrue_b32(), v454, v400); + svfloat32_t v502 = svadd_f32_x(svptrue_b32(), v297, v369); + svfloat32_t v551 = svadd_f32_x(svptrue_b32(), v550, v407); + svfloat32_t v552 = svsub_f32_x(svptrue_b32(), v550, v407); + svfloat32_t v598 = svadd_f32_x(svptrue_b32(), v296, v368); + svst1_f64(pred_full, (double *)(v948), svreinterpret_f64_f32(v297)); + svst1_f64(pred_full, (double *)(v1056), svreinterpret_f64_f32(v296)); + svfloat32_t v478 = svadd_f32_x(svptrue_b32(), v303, v375); + svfloat32_t v503 = svadd_f32_x(svptrue_b32(), v502, v445); + svfloat32_t v504 = svsub_f32_x(svptrue_b32(), v502, v445); + svfloat32_t v526 = svadd_f32_x(svptrue_b32(), v304, v376); + svfloat32_t v574 = svadd_f32_x(svptrue_b32(), v305, v377); + svfloat32_t v599 = svadd_f32_x(svptrue_b32(), v598, v444); + svfloat32_t v600 = svsub_f32_x(svptrue_b32(), v598, v444); + svfloat32_t v622 = svadd_f32_x(svptrue_b32(), v302, v374); + svst1_f64(pred_full, (double *)(v903), svreinterpret_f64_f32(v456)); + svst1_f64(pred_full, (double *)(v912), svreinterpret_f64_f32(v455)); + svst1_f64(pred_full, (double *)(v921), svreinterpret_f64_f32(v303)); + svst1_f64(pred_full, (double *)(v975), svreinterpret_f64_f32(v304)); + svst1_f64(pred_full, (double *)(v1011), svreinterpret_f64_f32(v552)); + svst1_f64(pred_full, (double *)(v1020), svreinterpret_f64_f32(v551)); + svst1_f64(pred_full, (double *)(v1029), svreinterpret_f64_f32(v305)); + svst1_f64(pred_full, (double *)(v1083), svreinterpret_f64_f32(v302)); + svfloat32_t v479 = svadd_f32_x(svptrue_b32(), v478, v451); + svfloat32_t v480 = svsub_f32_x(svptrue_b32(), v478, v451); + svfloat32_t v527 = svadd_f32_x(svptrue_b32(), v526, v452); + svfloat32_t v528 = svsub_f32_x(svptrue_b32(), v526, v452); + svfloat32_t v575 = svadd_f32_x(svptrue_b32(), v574, v453); + svfloat32_t v576 = svsub_f32_x(svptrue_b32(), v574, v453); + svfloat32_t v623 = svadd_f32_x(svptrue_b32(), v622, v450); + svfloat32_t v624 = svsub_f32_x(svptrue_b32(), v622, v450); + svst1_f64(pred_full, (double *)(v957), svreinterpret_f64_f32(v504)); + svst1_f64(pred_full, (double *)(v966), 
svreinterpret_f64_f32(v503)); + svst1_f64(pred_full, (double *)(v1065), svreinterpret_f64_f32(v600)); + svst1_f64(pred_full, (double *)(v1074), svreinterpret_f64_f32(v599)); + svst1_f64(pred_full, (double *)(v930), svreinterpret_f64_f32(v480)); + svst1_f64(pred_full, (double *)(v939), svreinterpret_f64_f32(v479)); + svst1_f64(pred_full, (double *)(v984), svreinterpret_f64_f32(v528)); + svst1_f64(pred_full, (double *)(v993), svreinterpret_f64_f32(v527)); + svst1_f64(pred_full, (double *)(v1038), svreinterpret_f64_f32(v576)); + svst1_f64(pred_full, (double *)(v1047), svreinterpret_f64_f32(v575)); + svst1_f64(pred_full, (double *)(v1092), svreinterpret_f64_f32(v624)); + svst1_f64(pred_full, (double *)(v1101), svreinterpret_f64_f32(v623)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cf32_ac_n_gu25(const armral_cmplx_int16_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + int idist, float dir) { + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + for (int j = 0; j < howmany; j += 1) { + int16x4_t v164 = vld1s_s16(&v5[istride]); + float v876 = 9.6858316112863108e-01F; + float v879 = -2.4868988716485479e-01F; + float v880 = 2.4868988716485479e-01F; + float v1015 = 8.7630668004386358e-01F; + float v1018 = -4.8175367410171532e-01F; + float v1019 = 4.8175367410171532e-01F; + float v1154 = 7.2896862742141155e-01F; + float v1157 = -6.8454710592868862e-01F; + float v1158 = 6.8454710592868862e-01F; + float v1166 = 6.2790519529313527e-02F; + float v1169 = -9.9802672842827156e-01F; + float v1170 = 9.9802672842827156e-01F; + float v1293 = 5.3582679497899655e-01F; + float v1296 = -8.4432792550201508e-01F; + float v1297 = 8.4432792550201508e-01F; + float v1305 = -4.2577929156507272e-01F; + float v1308 = -9.0482705246601947e-01F; + float v1309 = 9.0482705246601947e-01F; + float v1317 = -6.3742398974868952e-01F; + float v1320 = 7.7051324277578936e-01F; + float v1321 = -7.7051324277578936e-01F; + float v1335 = -9.9211470131447776e-01F; + float v1338 = -1.2533323356430454e-01F; + float v1339 = 1.2533323356430454e-01F; + float v1355 = 2.5000000000000000e-01F; + float v1365 = 5.5901699437494745e-01F; + float v1375 = 6.1803398874989490e-01F; + float v1398 = 9.5105651629515353e-01F; + float v1399 = -9.5105651629515353e-01F; + float32x2_t v1401 = (float32x2_t){v4, v4}; + float v1422 = 2.0000000000000000e+00F; + int16x4_t v20 = vld1s_s16(&v5[0]); + float32x2_t v165 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v164)), 15); + float32x2_t v877 = (float32x2_t){v876, v876}; + float32x2_t v881 = (float32x2_t){v879, v880}; + float32x2_t v1016 = (float32x2_t){v1015, v1015}; + float32x2_t v1020 = (float32x2_t){v1018, v1019}; + float32x2_t v1155 = (float32x2_t){v1154, v1154}; + float32x2_t v1159 = (float32x2_t){v1157, v1158}; + float32x2_t v1167 = (float32x2_t){v1166, v1166}; + float32x2_t v1171 = (float32x2_t){v1169, v1170}; + float32x2_t v1201 = (float32x2_t){v1321, v1320}; + float32x2_t v1294 = (float32x2_t){v1293, v1293}; + float32x2_t v1298 = (float32x2_t){v1296, v1297}; + float32x2_t v1306 = (float32x2_t){v1305, v1305}; + float32x2_t v1310 = (float32x2_t){v1308, v1309}; + float32x2_t v1318 = (float32x2_t){v1317, v1317}; + float32x2_t v1322 = (float32x2_t){v1320, v1321}; + float32x2_t v1336 = (float32x2_t){v1335, v1335}; + float32x2_t v1340 = (float32x2_t){v1338, v1339}; + float32x2_t v1356 = (float32x2_t){v1355, v1355}; + float32x2_t v1366 = (float32x2_t){v1365, v1365}; + float32x2_t 
v1376 = (float32x2_t){v1375, v1375}; + float32x2_t v1400 = (float32x2_t){v1398, v1399}; + float32x2_t v1423 = (float32x2_t){v1422, v1422}; + float32x2_t v21 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v20)), 15); + int16x4_t v26 = vld1s_s16(&v5[istride * 5]); + int16x4_t v32 = vld1s_s16(&v5[istride * 10]); + int16x4_t v38 = vld1s_s16(&v5[istride * 15]); + int16x4_t v44 = vld1s_s16(&v5[istride * 20]); + int16x4_t v170 = vld1s_s16(&v5[istride * 6]); + int16x4_t v176 = vld1s_s16(&v5[istride * 11]); + int16x4_t v182 = vld1s_s16(&v5[istride * 16]); + int16x4_t v188 = vld1s_s16(&v5[istride * 21]); + int16x4_t v308 = vld1s_s16(&v5[istride * 2]); + int16x4_t v314 = vld1s_s16(&v5[istride * 7]); + int16x4_t v320 = vld1s_s16(&v5[istride * 12]); + int16x4_t v326 = vld1s_s16(&v5[istride * 17]); + int16x4_t v332 = vld1s_s16(&v5[istride * 22]); + int16x4_t v452 = vld1s_s16(&v5[istride * 3]); + int16x4_t v458 = vld1s_s16(&v5[istride * 8]); + int16x4_t v464 = vld1s_s16(&v5[istride * 13]); + int16x4_t v470 = vld1s_s16(&v5[istride * 18]); + int16x4_t v476 = vld1s_s16(&v5[istride * 23]); + int16x4_t v596 = vld1s_s16(&v5[istride * 4]); + int16x4_t v602 = vld1s_s16(&v5[istride * 9]); + int16x4_t v608 = vld1s_s16(&v5[istride * 14]); + int16x4_t v614 = vld1s_s16(&v5[istride * 19]); + int16x4_t v620 = vld1s_s16(&v5[istride * 24]); + float32x2_t v883 = vmul_f32(v1401, v881); + float32x2_t v1022 = vmul_f32(v1401, v1020); + float32x2_t v1161 = vmul_f32(v1401, v1159); + float32x2_t v1173 = vmul_f32(v1401, v1171); + float32x2_t v1203 = vmul_f32(v1401, v1201); + float32x2_t v1300 = vmul_f32(v1401, v1298); + float32x2_t v1312 = vmul_f32(v1401, v1310); + float32x2_t v1324 = vmul_f32(v1401, v1322); + float32x2_t v1342 = vmul_f32(v1401, v1340); + float32x2_t v1402 = vmul_f32(v1401, v1400); + float32x2_t v27 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v26)), 15); + float32x2_t v33 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v32)), 15); + float32x2_t v39 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v38)), 15); + float32x2_t v45 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v44)), 15); + float32x2_t v171 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v170)), 15); + float32x2_t v177 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v176)), 15); + float32x2_t v183 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v182)), 15); + float32x2_t v189 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v188)), 15); + float32x2_t v309 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v308)), 15); + float32x2_t v315 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v314)), 15); + float32x2_t v321 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v320)), 15); + float32x2_t v327 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v326)), 15); + float32x2_t v333 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v332)), 15); + float32x2_t v453 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v452)), 15); + float32x2_t v459 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v458)), 15); + float32x2_t v465 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v464)), 15); + float32x2_t v471 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v470)), 15); + float32x2_t v477 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v476)), 15); + float32x2_t v597 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v596)), 15); + float32x2_t v603 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v602)), 15); + float32x2_t v609 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v608)), 15); + float32x2_t v615 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v614)), 15); + float32x2_t v621 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v620)), 15); + float32x2_t v82 = vsub_f32(v27, v45); + float32x2_t v86 = vmul_f32(v27, v1423); + float32x2_t v100 = vsub_f32(v33, v39); + 
float32x2_t v104 = vmul_f32(v33, v1423); + float32x2_t v226 = vsub_f32(v171, v189); + float32x2_t v230 = vmul_f32(v171, v1423); + float32x2_t v244 = vsub_f32(v177, v183); + float32x2_t v248 = vmul_f32(v177, v1423); + float32x2_t v370 = vsub_f32(v315, v333); + float32x2_t v374 = vmul_f32(v315, v1423); + float32x2_t v388 = vsub_f32(v321, v327); + float32x2_t v392 = vmul_f32(v321, v1423); + float32x2_t v514 = vsub_f32(v459, v477); + float32x2_t v518 = vmul_f32(v459, v1423); + float32x2_t v532 = vsub_f32(v465, v471); + float32x2_t v536 = vmul_f32(v465, v1423); + float32x2_t v658 = vsub_f32(v603, v621); + float32x2_t v662 = vmul_f32(v603, v1423); + float32x2_t v676 = vsub_f32(v609, v615); + float32x2_t v680 = vmul_f32(v609, v1423); + float32x2_t v87 = vsub_f32(v86, v82); + float32x2_t v105 = vsub_f32(v104, v100); + float32x2_t v116 = vmul_f32(v100, v1376); + float32x2_t v131 = vmul_f32(v82, v1376); + float32x2_t v231 = vsub_f32(v230, v226); + float32x2_t v249 = vsub_f32(v248, v244); + float32x2_t v260 = vmul_f32(v244, v1376); + float32x2_t v275 = vmul_f32(v226, v1376); + float32x2_t v375 = vsub_f32(v374, v370); + float32x2_t v393 = vsub_f32(v392, v388); + float32x2_t v404 = vmul_f32(v388, v1376); + float32x2_t v419 = vmul_f32(v370, v1376); + float32x2_t v519 = vsub_f32(v518, v514); + float32x2_t v537 = vsub_f32(v536, v532); + float32x2_t v548 = vmul_f32(v532, v1376); + float32x2_t v563 = vmul_f32(v514, v1376); + float32x2_t v663 = vsub_f32(v662, v658); + float32x2_t v681 = vsub_f32(v680, v676); + float32x2_t v692 = vmul_f32(v676, v1376); + float32x2_t v707 = vmul_f32(v658, v1376); + float32x2_t v106 = vadd_f32(v87, v105); + float32x2_t v107 = vsub_f32(v87, v105); + float32x2_t v117 = vadd_f32(v82, v116); + float32x2_t v132 = vsub_f32(v131, v100); + float32x2_t v250 = vadd_f32(v231, v249); + float32x2_t v251 = vsub_f32(v231, v249); + float32x2_t v261 = vadd_f32(v226, v260); + float32x2_t v276 = vsub_f32(v275, v244); + float32x2_t v394 = vadd_f32(v375, v393); + float32x2_t v395 = vsub_f32(v375, v393); + float32x2_t v405 = vadd_f32(v370, v404); + float32x2_t v420 = vsub_f32(v419, v388); + float32x2_t v538 = vadd_f32(v519, v537); + float32x2_t v539 = vsub_f32(v519, v537); + float32x2_t v549 = vadd_f32(v514, v548); + float32x2_t v564 = vsub_f32(v563, v532); + float32x2_t v682 = vadd_f32(v663, v681); + float32x2_t v683 = vsub_f32(v663, v681); + float32x2_t v693 = vadd_f32(v658, v692); + float32x2_t v708 = vsub_f32(v707, v676); + float32x2_t v111 = vmul_f32(v106, v1356); + float32x2_t v121 = vmul_f32(v107, v1366); + float32x2_t v133 = vadd_f32(v21, v106); + float32x2_t v139 = vrev64_f32(v117); + float32x2_t v147 = vrev64_f32(v132); + float32x2_t v255 = vmul_f32(v250, v1356); + float32x2_t v265 = vmul_f32(v251, v1366); + float32x2_t v277 = vadd_f32(v165, v250); + float32x2_t v283 = vrev64_f32(v261); + float32x2_t v291 = vrev64_f32(v276); + float32x2_t v399 = vmul_f32(v394, v1356); + float32x2_t v409 = vmul_f32(v395, v1366); + float32x2_t v421 = vadd_f32(v309, v394); + float32x2_t v427 = vrev64_f32(v405); + float32x2_t v435 = vrev64_f32(v420); + float32x2_t v543 = vmul_f32(v538, v1356); + float32x2_t v553 = vmul_f32(v539, v1366); + float32x2_t v565 = vadd_f32(v453, v538); + float32x2_t v571 = vrev64_f32(v549); + float32x2_t v579 = vrev64_f32(v564); + float32x2_t v687 = vmul_f32(v682, v1356); + float32x2_t v697 = vmul_f32(v683, v1366); + float32x2_t v709 = vadd_f32(v597, v682); + float32x2_t v715 = vrev64_f32(v693); + float32x2_t v723 = vrev64_f32(v708); + float32x2_t v112 = vsub_f32(v21, v111); + 
float32x2_t v140 = vmul_f32(v139, v1402); + float32x2_t v148 = vmul_f32(v147, v1402); + float32x2_t v256 = vsub_f32(v165, v255); + float32x2_t v284 = vmul_f32(v283, v1402); + float32x2_t v292 = vmul_f32(v291, v1402); + float32x2_t v400 = vsub_f32(v309, v399); + float32x2_t v428 = vmul_f32(v427, v1402); + float32x2_t v436 = vmul_f32(v435, v1402); + float32x2_t v544 = vsub_f32(v453, v543); + float32x2_t v572 = vmul_f32(v571, v1402); + float32x2_t v580 = vmul_f32(v579, v1402); + float32x2_t v688 = vsub_f32(v597, v687); + float32x2_t v716 = vmul_f32(v715, v1402); + float32x2_t v724 = vmul_f32(v723, v1402); + float32x2_t v772 = vsub_f32(v277, v709); + float32x2_t v776 = vmul_f32(v277, v1423); + float32x2_t v790 = vsub_f32(v421, v565); + float32x2_t v794 = vmul_f32(v421, v1423); + float32x2_t v122 = vsub_f32(v112, v121); + float32x2_t v126 = vmul_f32(v112, v1423); + float32x2_t v266 = vsub_f32(v256, v265); + float32x2_t v270 = vmul_f32(v256, v1423); + float32x2_t v410 = vsub_f32(v400, v409); + float32x2_t v414 = vmul_f32(v400, v1423); + float32x2_t v554 = vsub_f32(v544, v553); + float32x2_t v558 = vmul_f32(v544, v1423); + float32x2_t v698 = vsub_f32(v688, v697); + float32x2_t v702 = vmul_f32(v688, v1423); + float32x2_t v777 = vsub_f32(v776, v772); + float32x2_t v795 = vsub_f32(v794, v790); + float32x2_t v806 = vmul_f32(v790, v1376); + float32x2_t v821 = vmul_f32(v772, v1376); + float32x2_t v127 = vsub_f32(v126, v122); + float32x2_t v149 = vsub_f32(v122, v148); + float32x2_t v153 = vmul_f32(v122, v1423); + float32x2_t v271 = vsub_f32(v270, v266); + float32x2_t v293 = vsub_f32(v266, v292); + float32x2_t v297 = vmul_f32(v266, v1423); + float32x2_t v415 = vsub_f32(v414, v410); + float32x2_t v437 = vsub_f32(v410, v436); + float32x2_t v441 = vmul_f32(v410, v1423); + float32x2_t v559 = vsub_f32(v558, v554); + float32x2_t v581 = vsub_f32(v554, v580); + float32x2_t v585 = vmul_f32(v554, v1423); + float32x2_t v703 = vsub_f32(v702, v698); + float32x2_t v725 = vsub_f32(v698, v724); + float32x2_t v729 = vmul_f32(v698, v1423); + float32x2_t v796 = vadd_f32(v777, v795); + float32x2_t v797 = vsub_f32(v777, v795); + float32x2_t v807 = vadd_f32(v772, v806); + float32x2_t v822 = vsub_f32(v821, v790); + float32x2_t v141 = vsub_f32(v127, v140); + float32x2_t v154 = vsub_f32(v153, v149); + float32x2_t v158 = vmul_f32(v127, v1423); + float32x2_t v285 = vsub_f32(v271, v284); + float32x2_t v298 = vsub_f32(v297, v293); + float32x2_t v302 = vmul_f32(v271, v1423); + float32x2_t v429 = vsub_f32(v415, v428); + float32x2_t v442 = vsub_f32(v441, v437); + float32x2_t v446 = vmul_f32(v415, v1423); + float32x2_t v573 = vsub_f32(v559, v572); + float32x2_t v586 = vsub_f32(v585, v581); + float32x2_t v590 = vmul_f32(v559, v1423); + float32x2_t v717 = vsub_f32(v703, v716); + float32x2_t v730 = vsub_f32(v729, v725); + float32x2_t v734 = vmul_f32(v703, v1423); + float32x2_t v801 = vmul_f32(v796, v1356); + float32x2_t v811 = vmul_f32(v797, v1366); + float32x2_t v823 = vadd_f32(v133, v796); + float32x2_t v834 = vrev64_f32(v807); + float32x2_t v847 = vrev64_f32(v822); + float32x2_t v1023 = vrev64_f32(v293); + float32x2_t v1035 = vrev64_f32(v437); + float32x2_t v1047 = vrev64_f32(v725); + float32x2_t v1065 = vrev64_f32(v581); + float32x2_t v159 = vsub_f32(v158, v141); + float32x2_t v303 = vsub_f32(v302, v285); + float32x2_t v447 = vsub_f32(v446, v429); + float32x2_t v591 = vsub_f32(v590, v573); + float32x2_t v735 = vsub_f32(v734, v717); + float32x2_t v802 = vsub_f32(v133, v801); + v6[0] = v823; + float32x2_t v835 = vmul_f32(v834, v1402); + 
float32x2_t v848 = vmul_f32(v847, v1402); + float32x2_t v884 = vrev64_f32(v285); + float32x2_t v896 = vrev64_f32(v429); + float32x2_t v908 = vrev64_f32(v717); + float32x2_t v926 = vrev64_f32(v573); + float32x2_t v1024 = vmul_f32(v1023, v1022); + float32x2_t v1036 = vmul_f32(v1035, v1300); + float32x2_t v1048 = vmul_f32(v1047, v1312); + float32x2_t v1066 = vmul_f32(v1065, v1173); + float32x2_t v1162 = vrev64_f32(v298); + float32x2_t v1174 = vrev64_f32(v442); + float32x2_t v1186 = vrev64_f32(v730); + float32x2_t v1204 = vrev64_f32(v586); + float32x2_t v812 = vsub_f32(v802, v811); + float32x2_t v816 = vmul_f32(v802, v1423); + float32x2_t v885 = vmul_f32(v884, v883); + float32x2_t v897 = vmul_f32(v896, v1022); + float32x2_t v909 = vmul_f32(v908, v1300); + float32x2_t v927 = vmul_f32(v926, v1161); + float32x2_t v1025 = vfma_f32(v1024, v293, v1016); + float32x2_t v1037 = vfma_f32(v1036, v437, v1294); + float32x2_t v1049 = vfma_f32(v1048, v725, v1306); + float32x2_t v1067 = vfma_f32(v1066, v581, v1167); + float32x2_t v1163 = vmul_f32(v1162, v1161); + float32x2_t v1175 = vmul_f32(v1174, v1173); + float32x2_t v1187 = vmul_f32(v1186, v1342); + float32x2_t v1205 = vmul_f32(v1204, v1203); + float32x2_t v1301 = vrev64_f32(v303); + float32x2_t v1313 = vrev64_f32(v447); + float32x2_t v1325 = vrev64_f32(v735); + float32x2_t v1343 = vrev64_f32(v591); + float32x2_t v817 = vsub_f32(v816, v812); + float32x2_t v849 = vsub_f32(v812, v848); + float32x2_t v858 = vmul_f32(v812, v1423); + float32x2_t v886 = vfma_f32(v885, v285, v877); + float32x2_t v898 = vfma_f32(v897, v429, v1016); + float32x2_t v910 = vfma_f32(v909, v717, v1294); + float32x2_t v928 = vfma_f32(v927, v573, v1155); + float32x2_t v1050 = vsub_f32(v1025, v1049); + float32x2_t v1054 = vmul_f32(v1025, v1423); + float32x2_t v1068 = vsub_f32(v1037, v1067); + float32x2_t v1072 = vmul_f32(v1037, v1423); + float32x2_t v1164 = vfma_f32(v1163, v298, v1155); + float32x2_t v1176 = vfma_f32(v1175, v442, v1167); + float32x2_t v1188 = vfma_f32(v1187, v730, v1336); + float32x2_t v1206 = vfma_f32(v1205, v586, v1318); + float32x2_t v1302 = vmul_f32(v1301, v1300); + float32x2_t v1314 = vmul_f32(v1313, v1312); + float32x2_t v1326 = vmul_f32(v1325, v1324); + float32x2_t v1344 = vmul_f32(v1343, v1342); + float32x2_t v836 = vsub_f32(v817, v835); + v6[ostride * 10] = v849; + float32x2_t v859 = vsub_f32(v858, v849); + float32x2_t v868 = vmul_f32(v817, v1423); + float32x2_t v911 = vsub_f32(v886, v910); + float32x2_t v915 = vmul_f32(v886, v1423); + float32x2_t v929 = vsub_f32(v898, v928); + float32x2_t v933 = vmul_f32(v898, v1423); + float32x2_t v1055 = vsub_f32(v1054, v1050); + float32x2_t v1073 = vsub_f32(v1072, v1068); + float32x2_t v1084 = vmul_f32(v1068, v1376); + float32x2_t v1099 = vmul_f32(v1050, v1376); + float32x2_t v1189 = vsub_f32(v1164, v1188); + float32x2_t v1193 = vmul_f32(v1164, v1423); + float32x2_t v1207 = vsub_f32(v1176, v1206); + float32x2_t v1211 = vmul_f32(v1176, v1423); + float32x2_t v1303 = vfma_f32(v1302, v303, v1294); + float32x2_t v1315 = vfma_f32(v1314, v447, v1306); + float32x2_t v1327 = vfma_f32(v1326, v735, v1318); + float32x2_t v1345 = vfma_f32(v1344, v591, v1336); + v6[ostride * 5] = v836; + v6[ostride * 15] = v859; + float32x2_t v869 = vsub_f32(v868, v836); + float32x2_t v916 = vsub_f32(v915, v911); + float32x2_t v934 = vsub_f32(v933, v929); + float32x2_t v945 = vmul_f32(v929, v1376); + float32x2_t v960 = vmul_f32(v911, v1376); + float32x2_t v1074 = vadd_f32(v1055, v1073); + float32x2_t v1075 = vsub_f32(v1055, v1073); + float32x2_t v1085 = 
vadd_f32(v1050, v1084); + float32x2_t v1100 = vsub_f32(v1099, v1068); + float32x2_t v1194 = vsub_f32(v1193, v1189); + float32x2_t v1212 = vsub_f32(v1211, v1207); + float32x2_t v1223 = vmul_f32(v1207, v1376); + float32x2_t v1238 = vmul_f32(v1189, v1376); + float32x2_t v1328 = vsub_f32(v1303, v1327); + float32x2_t v1332 = vmul_f32(v1303, v1423); + float32x2_t v1346 = vsub_f32(v1315, v1345); + float32x2_t v1350 = vmul_f32(v1315, v1423); + v6[ostride * 20] = v869; + float32x2_t v935 = vadd_f32(v916, v934); + float32x2_t v936 = vsub_f32(v916, v934); + float32x2_t v946 = vadd_f32(v911, v945); + float32x2_t v961 = vsub_f32(v960, v929); + float32x2_t v1079 = vmul_f32(v1074, v1356); + float32x2_t v1089 = vmul_f32(v1075, v1366); + float32x2_t v1101 = vadd_f32(v149, v1074); + float32x2_t v1112 = vrev64_f32(v1085); + float32x2_t v1125 = vrev64_f32(v1100); + float32x2_t v1213 = vadd_f32(v1194, v1212); + float32x2_t v1214 = vsub_f32(v1194, v1212); + float32x2_t v1224 = vadd_f32(v1189, v1223); + float32x2_t v1239 = vsub_f32(v1238, v1207); + float32x2_t v1333 = vsub_f32(v1332, v1328); + float32x2_t v1351 = vsub_f32(v1350, v1346); + float32x2_t v1362 = vmul_f32(v1346, v1376); + float32x2_t v1377 = vmul_f32(v1328, v1376); + float32x2_t v940 = vmul_f32(v935, v1356); + float32x2_t v950 = vmul_f32(v936, v1366); + float32x2_t v962 = vadd_f32(v141, v935); + float32x2_t v973 = vrev64_f32(v946); + float32x2_t v986 = vrev64_f32(v961); + float32x2_t v1080 = vsub_f32(v149, v1079); + v6[ostride * 2] = v1101; + float32x2_t v1113 = vmul_f32(v1112, v1402); + float32x2_t v1126 = vmul_f32(v1125, v1402); + float32x2_t v1218 = vmul_f32(v1213, v1356); + float32x2_t v1228 = vmul_f32(v1214, v1366); + float32x2_t v1240 = vadd_f32(v154, v1213); + float32x2_t v1251 = vrev64_f32(v1224); + float32x2_t v1264 = vrev64_f32(v1239); + float32x2_t v1352 = vadd_f32(v1333, v1351); + float32x2_t v1353 = vsub_f32(v1333, v1351); + float32x2_t v1363 = vadd_f32(v1328, v1362); + float32x2_t v1378 = vsub_f32(v1377, v1346); + float32x2_t v941 = vsub_f32(v141, v940); + v6[ostride] = v962; + float32x2_t v974 = vmul_f32(v973, v1402); + float32x2_t v987 = vmul_f32(v986, v1402); + float32x2_t v1090 = vsub_f32(v1080, v1089); + float32x2_t v1094 = vmul_f32(v1080, v1423); + float32x2_t v1219 = vsub_f32(v154, v1218); + v6[ostride * 3] = v1240; + float32x2_t v1252 = vmul_f32(v1251, v1402); + float32x2_t v1265 = vmul_f32(v1264, v1402); + float32x2_t v1357 = vmul_f32(v1352, v1356); + float32x2_t v1367 = vmul_f32(v1353, v1366); + float32x2_t v1379 = vadd_f32(v159, v1352); + float32x2_t v1390 = vrev64_f32(v1363); + float32x2_t v1403 = vrev64_f32(v1378); + float32x2_t v951 = vsub_f32(v941, v950); + float32x2_t v955 = vmul_f32(v941, v1423); + float32x2_t v1095 = vsub_f32(v1094, v1090); + float32x2_t v1127 = vsub_f32(v1090, v1126); + float32x2_t v1136 = vmul_f32(v1090, v1423); + float32x2_t v1229 = vsub_f32(v1219, v1228); + float32x2_t v1233 = vmul_f32(v1219, v1423); + float32x2_t v1358 = vsub_f32(v159, v1357); + v6[ostride * 4] = v1379; + float32x2_t v1391 = vmul_f32(v1390, v1402); + float32x2_t v1404 = vmul_f32(v1403, v1402); + float32x2_t v956 = vsub_f32(v955, v951); + float32x2_t v988 = vsub_f32(v951, v987); + float32x2_t v997 = vmul_f32(v951, v1423); + float32x2_t v1114 = vsub_f32(v1095, v1113); + v6[ostride * 12] = v1127; + float32x2_t v1137 = vsub_f32(v1136, v1127); + float32x2_t v1146 = vmul_f32(v1095, v1423); + float32x2_t v1234 = vsub_f32(v1233, v1229); + float32x2_t v1266 = vsub_f32(v1229, v1265); + float32x2_t v1275 = vmul_f32(v1229, v1423); + 
float32x2_t v1368 = vsub_f32(v1358, v1367); + float32x2_t v1372 = vmul_f32(v1358, v1423); + float32x2_t v975 = vsub_f32(v956, v974); + v6[ostride * 11] = v988; + float32x2_t v998 = vsub_f32(v997, v988); + float32x2_t v1007 = vmul_f32(v956, v1423); + v6[ostride * 7] = v1114; + v6[ostride * 17] = v1137; + float32x2_t v1147 = vsub_f32(v1146, v1114); + float32x2_t v1253 = vsub_f32(v1234, v1252); + v6[ostride * 13] = v1266; + float32x2_t v1276 = vsub_f32(v1275, v1266); + float32x2_t v1285 = vmul_f32(v1234, v1423); + float32x2_t v1373 = vsub_f32(v1372, v1368); + float32x2_t v1405 = vsub_f32(v1368, v1404); + float32x2_t v1414 = vmul_f32(v1368, v1423); + v6[ostride * 6] = v975; + v6[ostride * 16] = v998; + float32x2_t v1008 = vsub_f32(v1007, v975); + v6[ostride * 22] = v1147; + v6[ostride * 8] = v1253; + v6[ostride * 18] = v1276; + float32x2_t v1286 = vsub_f32(v1285, v1253); + float32x2_t v1392 = vsub_f32(v1373, v1391); + v6[ostride * 14] = v1405; + float32x2_t v1415 = vsub_f32(v1414, v1405); + float32x2_t v1424 = vmul_f32(v1373, v1423); + v6[ostride * 21] = v1008; + v6[ostride * 23] = v1286; + v6[ostride * 9] = v1392; + v6[ostride * 19] = v1415; + float32x2_t v1425 = vsub_f32(v1424, v1392); + v6[ostride * 24] = v1425; + v5 += 1 * idist; + v6 += 1 * 1; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cf32_ac_n_gu25(const armral_cmplx_int16_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + int idist, float dir) { + int64_t v0 = istride; + int64_t v1 = idist; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * v1; + int64_t v12 = v10 * 1; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + float v1016 = 9.6858316112863108e-01F; + float v1021 = 2.4868988716485479e-01F; + float v1178 = 8.7630668004386358e-01F; + float v1183 = 4.8175367410171532e-01F; + float v1340 = 7.2896862742141155e-01F; + float v1345 = 6.8454710592868862e-01F; + float v1353 = 6.2790519529313527e-02F; + float v1358 = 9.9802672842827156e-01F; + float v1391 = 7.7051324277578925e-01F; + float v1502 = 5.3582679497899655e-01F; + float v1507 = 8.4432792550201508e-01F; + float v1515 = -4.2577929156507272e-01F; + float v1520 = 9.0482705246601947e-01F; + float v1528 = -6.3742398974868952e-01F; + float v1533 = -7.7051324277578936e-01F; + float v1548 = -9.9211470131447776e-01F; + float v1553 = 1.2533323356430454e-01F; + float v1570 = 2.5000000000000000e-01F; + float v1582 = 5.5901699437494745e-01F; + float v1594 = 6.1803398874989490e-01F; + float v1623 = -9.5105651629515353e-01F; + float v1651 = 2.0000000000000000e+00F; + const int32_t *v1734 = &v5[v0]; + float32x2_t *v2070 = &v6[v2]; + int64_t v27 = v0 * 5; + int64_t v35 = v0 * 10; + int64_t v43 = v0 * 15; + int64_t v51 = v0 * 20; + int64_t v194 = v0 * 6; + int64_t v202 = v0 * 11; + int64_t v210 = v0 * 16; + int64_t v218 = v0 * 21; + int64_t v353 = v0 * 2; + int64_t v361 = v0 * 7; + int64_t v369 = v0 * 12; + int64_t v377 = v0 * 17; + int64_t v385 = v0 * 22; + int64_t v520 = v0 * 3; + int64_t v528 = v0 * 8; + int64_t v536 = v0 * 13; + int64_t v544 = v0 * 18; + int64_t v552 = v0 * 23; + int64_t v687 = v0 * 4; + int64_t v695 = v0 * 9; + int64_t v703 = v0 * 14; + int64_t v711 = v0 * 19; + int64_t v719 = v0 * 24; + int64_t v968 = v2 * 5; + int64_t v983 = v2 * 10; + int64_t v996 = v2 * 15; + int64_t v1009 = v2 * 20; + float v1024 = v4 * v1021; + int64_t 
v1130 = v2 * 6; + int64_t v1145 = v2 * 11; + int64_t v1158 = v2 * 16; + int64_t v1171 = v2 * 21; + float v1186 = v4 * v1183; + int64_t v1277 = v2 * 2; + int64_t v1292 = v2 * 7; + int64_t v1307 = v2 * 12; + int64_t v1320 = v2 * 17; + int64_t v1333 = v2 * 22; + float v1348 = v4 * v1345; + float v1361 = v4 * v1358; + float v1394 = v4 * v1391; + int64_t v1439 = v2 * 3; + int64_t v1454 = v2 * 8; + int64_t v1469 = v2 * 13; + int64_t v1482 = v2 * 18; + int64_t v1495 = v2 * 23; + float v1510 = v4 * v1507; + float v1523 = v4 * v1520; + float v1536 = v4 * v1533; + float v1556 = v4 * v1553; + int64_t v1601 = v2 * 4; + int64_t v1616 = v2 * 9; + float v1626 = v4 * v1623; + int64_t v1631 = v2 * 14; + int64_t v1644 = v2 * 19; + int64_t v1657 = v2 * 24; + const int32_t *v1670 = &v5[0]; + svint64_t v1963 = svindex_s64(0, v1); + svfloat32_t v1992 = svdup_n_f32(0); + float32x2_t *v2006 = &v6[0]; + svfloat32_t v2049 = svdup_n_f32(v1016); + svfloat32_t v2113 = svdup_n_f32(v1178); + svfloat32_t v2177 = svdup_n_f32(v1340); + svfloat32_t v2179 = svdup_n_f32(v1353); + svfloat32_t v2241 = svdup_n_f32(v1502); + svfloat32_t v2243 = svdup_n_f32(v1515); + svfloat32_t v2245 = svdup_n_f32(v1528); + svfloat32_t v2248 = svdup_n_f32(v1548); + svfloat32_t v2251 = svdup_n_f32(v1570); + svfloat32_t v2253 = svdup_n_f32(v1582); + svfloat32_t v2255 = svdup_n_f32(v1594); + svfloat32_t v2295 = svdup_n_f32(v1651); + svint16_t v1672 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1670), v1963)); + const int32_t *v1679 = &v5[v27]; + const int32_t *v1688 = &v5[v35]; + const int32_t *v1697 = &v5[v43]; + const int32_t *v1706 = &v5[v51]; + svint16_t v1736 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1734), v1963)); + const int32_t *v1743 = &v5[v194]; + const int32_t *v1752 = &v5[v202]; + const int32_t *v1761 = &v5[v210]; + const int32_t *v1770 = &v5[v218]; + const int32_t *v1798 = &v5[v353]; + const int32_t *v1807 = &v5[v361]; + const int32_t *v1816 = &v5[v369]; + const int32_t *v1825 = &v5[v377]; + const int32_t *v1834 = &v5[v385]; + const int32_t *v1862 = &v5[v520]; + const int32_t *v1871 = &v5[v528]; + const int32_t *v1880 = &v5[v536]; + const int32_t *v1889 = &v5[v544]; + const int32_t *v1898 = &v5[v552]; + const int32_t *v1926 = &v5[v687]; + const int32_t *v1935 = &v5[v695]; + const int32_t *v1944 = &v5[v703]; + const int32_t *v1953 = &v5[v711]; + const int32_t *v1962 = &v5[v719]; + float32x2_t *v2016 = &v6[v968]; + float32x2_t *v2026 = &v6[v983]; + float32x2_t *v2036 = &v6[v996]; + float32x2_t *v2046 = &v6[v1009]; + svfloat32_t v2050 = svdup_n_f32(v1024); + float32x2_t *v2080 = &v6[v1130]; + float32x2_t *v2090 = &v6[v1145]; + float32x2_t *v2100 = &v6[v1158]; + float32x2_t *v2110 = &v6[v1171]; + svfloat32_t v2114 = svdup_n_f32(v1186); + float32x2_t *v2134 = &v6[v1277]; + float32x2_t *v2144 = &v6[v1292]; + float32x2_t *v2154 = &v6[v1307]; + float32x2_t *v2164 = &v6[v1320]; + float32x2_t *v2174 = &v6[v1333]; + svfloat32_t v2178 = svdup_n_f32(v1348); + svfloat32_t v2180 = svdup_n_f32(v1361); + svfloat32_t v2185 = svdup_n_f32(v1394); + float32x2_t *v2198 = &v6[v1439]; + float32x2_t *v2208 = &v6[v1454]; + float32x2_t *v2218 = &v6[v1469]; + float32x2_t *v2228 = &v6[v1482]; + float32x2_t *v2238 = &v6[v1495]; + svfloat32_t v2242 = svdup_n_f32(v1510); + svfloat32_t v2244 = svdup_n_f32(v1523); + svfloat32_t v2246 = svdup_n_f32(v1536); + svfloat32_t v2249 = svdup_n_f32(v1556); + float32x2_t *v2262 = &v6[v1601]; + float32x2_t *v2272 = &v6[v1616]; + svfloat32_t v2275 = 
svdup_n_f32(v1626); + float32x2_t *v2282 = &v6[v1631]; + float32x2_t *v2292 = &v6[v1644]; + float32x2_t *v2302 = &v6[v1657]; + svfloat32_t v25 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1672, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v192 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1736, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svint16_t v1681 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1679), v1963)); + svint16_t v1690 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1688), v1963)); + svint16_t v1699 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1697), v1963)); + svint16_t v1708 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1706), v1963)); + svint16_t v1745 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1743), v1963)); + svint16_t v1754 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1752), v1963)); + svint16_t v1763 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1761), v1963)); + svint16_t v1772 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1770), v1963)); + svint16_t v1800 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1798), v1963)); + svint16_t v1809 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1807), v1963)); + svint16_t v1818 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1816), v1963)); + svint16_t v1827 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1825), v1963)); + svint16_t v1836 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1834), v1963)); + svint16_t v1864 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1862), v1963)); + svint16_t v1873 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1871), v1963)); + svint16_t v1882 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1880), v1963)); + svint16_t v1891 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1889), v1963)); + svint16_t v1900 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1898), v1963)); + svint16_t v1928 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1926), v1963)); + svint16_t v1937 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1935), v1963)); + svint16_t v1946 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1944), v1963)); + svint16_t v1955 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1953), v1963)); + svint16_t v1964 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1962), v1963)); + svfloat32_t v33 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1681, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 
0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v41 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1690, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v49 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1699, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v57 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1708, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v200 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1745, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v208 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1754, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v216 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1763, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v224 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1772, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v359 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1800, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v367 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1809, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v375 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1818, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v383 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1827, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v391 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1836, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v526 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1864, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v534 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1873, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v542 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1882, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 
0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v550 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1891, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v558 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1900, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v693 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1928, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v701 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1937, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v709 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1946, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v717 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1955, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v725 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1964, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v70 = svcmla_f32_x(pred_full, v33, v1992, v33, 90); + svfloat32_t v83 = svcmla_f32_x(pred_full, v41, v1992, v41, 90); + svfloat32_t v96 = svcmla_f32_x(pred_full, v57, v1992, v57, 90); + svfloat32_t v116 = svcmla_f32_x(pred_full, v49, v1992, v49, 90); + svfloat32_t v237 = svcmla_f32_x(pred_full, v200, v1992, v200, 90); + svfloat32_t v250 = svcmla_f32_x(pred_full, v208, v1992, v208, 90); + svfloat32_t v263 = svcmla_f32_x(pred_full, v224, v1992, v224, 90); + svfloat32_t v283 = svcmla_f32_x(pred_full, v216, v1992, v216, 90); + svfloat32_t v404 = svcmla_f32_x(pred_full, v367, v1992, v367, 90); + svfloat32_t v417 = svcmla_f32_x(pred_full, v375, v1992, v375, 90); + svfloat32_t v430 = svcmla_f32_x(pred_full, v391, v1992, v391, 90); + svfloat32_t v450 = svcmla_f32_x(pred_full, v383, v1992, v383, 90); + svfloat32_t v571 = svcmla_f32_x(pred_full, v534, v1992, v534, 90); + svfloat32_t v584 = svcmla_f32_x(pred_full, v542, v1992, v542, 90); + svfloat32_t v597 = svcmla_f32_x(pred_full, v558, v1992, v558, 90); + svfloat32_t v617 = svcmla_f32_x(pred_full, v550, v1992, v550, 90); + svfloat32_t v738 = svcmla_f32_x(pred_full, v701, v1992, v701, 90); + svfloat32_t v751 = svcmla_f32_x(pred_full, v709, v1992, v709, 90); + svfloat32_t v764 = svcmla_f32_x(pred_full, v725, v1992, v725, 90); + svfloat32_t v784 = svcmla_f32_x(pred_full, v717, v1992, v717, 90); + svfloat32_t v97 = svsub_f32_x(svptrue_b32(), v70, v96); + svfloat32_t v117 = svsub_f32_x(svptrue_b32(), v83, v116); + svfloat32_t v264 = svsub_f32_x(svptrue_b32(), v237, v263); + svfloat32_t v284 = svsub_f32_x(svptrue_b32(), v250, v283); + svfloat32_t v431 = svsub_f32_x(svptrue_b32(), v404, v430); + svfloat32_t v451 = svsub_f32_x(svptrue_b32(), v417, v450); + svfloat32_t v598 = svsub_f32_x(svptrue_b32(), v571, v597); + svfloat32_t v618 = svsub_f32_x(svptrue_b32(), v584, 
v617); + svfloat32_t v765 = svsub_f32_x(svptrue_b32(), v738, v764); + svfloat32_t v785 = svsub_f32_x(svptrue_b32(), v751, v784); + svfloat32_t v103 = svnmls_f32_x(pred_full, v97, v70, v2295); + svfloat32_t v123 = svnmls_f32_x(pred_full, v117, v83, v2295); + svfloat32_t v270 = svnmls_f32_x(pred_full, v264, v237, v2295); + svfloat32_t v290 = svnmls_f32_x(pred_full, v284, v250, v2295); + svfloat32_t v437 = svnmls_f32_x(pred_full, v431, v404, v2295); + svfloat32_t v457 = svnmls_f32_x(pred_full, v451, v417, v2295); + svfloat32_t v604 = svnmls_f32_x(pred_full, v598, v571, v2295); + svfloat32_t v624 = svnmls_f32_x(pred_full, v618, v584, v2295); + svfloat32_t v771 = svnmls_f32_x(pred_full, v765, v738, v2295); + svfloat32_t v791 = svnmls_f32_x(pred_full, v785, v751, v2295); + svfloat32_t v124 = svadd_f32_x(svptrue_b32(), v103, v123); + svfloat32_t v125 = svsub_f32_x(svptrue_b32(), v103, v123); + svfloat32_t v137 = svmla_f32_x(pred_full, v97, v117, v2255); + svfloat32_t v155 = svnmls_f32_x(pred_full, v117, v97, v2255); + svfloat32_t v291 = svadd_f32_x(svptrue_b32(), v270, v290); + svfloat32_t v292 = svsub_f32_x(svptrue_b32(), v270, v290); + svfloat32_t v304 = svmla_f32_x(pred_full, v264, v284, v2255); + svfloat32_t v322 = svnmls_f32_x(pred_full, v284, v264, v2255); + svfloat32_t v458 = svadd_f32_x(svptrue_b32(), v437, v457); + svfloat32_t v459 = svsub_f32_x(svptrue_b32(), v437, v457); + svfloat32_t v471 = svmla_f32_x(pred_full, v431, v451, v2255); + svfloat32_t v489 = svnmls_f32_x(pred_full, v451, v431, v2255); + svfloat32_t v625 = svadd_f32_x(svptrue_b32(), v604, v624); + svfloat32_t v626 = svsub_f32_x(svptrue_b32(), v604, v624); + svfloat32_t v638 = svmla_f32_x(pred_full, v598, v618, v2255); + svfloat32_t v656 = svnmls_f32_x(pred_full, v618, v598, v2255); + svfloat32_t v792 = svadd_f32_x(svptrue_b32(), v771, v791); + svfloat32_t v793 = svsub_f32_x(svptrue_b32(), v771, v791); + svfloat32_t v805 = svmla_f32_x(pred_full, v765, v785, v2255); + svfloat32_t v823 = svnmls_f32_x(pred_full, v785, v765, v2255); + svfloat32_t v156 = svadd_f32_x(svptrue_b32(), v25, v124); + svfloat32_t zero163 = svdup_n_f32(0); + svfloat32_t v163 = svcmla_f32_x(pred_full, zero163, v2275, v137, 90); + svfloat32_t zero171 = svdup_n_f32(0); + svfloat32_t v171 = svcmla_f32_x(pred_full, zero171, v2275, v155, 90); + svfloat32_t v323 = svadd_f32_x(svptrue_b32(), v192, v291); + svfloat32_t zero330 = svdup_n_f32(0); + svfloat32_t v330 = svcmla_f32_x(pred_full, zero330, v2275, v304, 90); + svfloat32_t zero338 = svdup_n_f32(0); + svfloat32_t v338 = svcmla_f32_x(pred_full, zero338, v2275, v322, 90); + svfloat32_t v490 = svadd_f32_x(svptrue_b32(), v359, v458); + svfloat32_t zero497 = svdup_n_f32(0); + svfloat32_t v497 = svcmla_f32_x(pred_full, zero497, v2275, v471, 90); + svfloat32_t zero505 = svdup_n_f32(0); + svfloat32_t v505 = svcmla_f32_x(pred_full, zero505, v2275, v489, 90); + svfloat32_t v657 = svadd_f32_x(svptrue_b32(), v526, v625); + svfloat32_t zero664 = svdup_n_f32(0); + svfloat32_t v664 = svcmla_f32_x(pred_full, zero664, v2275, v638, 90); + svfloat32_t zero672 = svdup_n_f32(0); + svfloat32_t v672 = svcmla_f32_x(pred_full, zero672, v2275, v656, 90); + svfloat32_t v824 = svadd_f32_x(svptrue_b32(), v693, v792); + svfloat32_t zero831 = svdup_n_f32(0); + svfloat32_t v831 = svcmla_f32_x(pred_full, zero831, v2275, v805, 90); + svfloat32_t zero839 = svdup_n_f32(0); + svfloat32_t v839 = svcmla_f32_x(pred_full, zero839, v2275, v823, 90); + svfloat32_t v131 = svmls_f32_x(pred_full, v25, v124, v2251); + svfloat32_t v298 = 
svmls_f32_x(pred_full, v192, v291, v2251); + svfloat32_t v465 = svmls_f32_x(pred_full, v359, v458, v2251); + svfloat32_t v632 = svmls_f32_x(pred_full, v526, v625, v2251); + svfloat32_t v799 = svmls_f32_x(pred_full, v693, v792, v2251); + svfloat32_t v143 = svmls_f32_x(pred_full, v131, v125, v2253); + svfloat32_t v310 = svmls_f32_x(pred_full, v298, v292, v2253); + svfloat32_t v477 = svmls_f32_x(pred_full, v465, v459, v2253); + svfloat32_t v644 = svmls_f32_x(pred_full, v632, v626, v2253); + svfloat32_t v811 = svmls_f32_x(pred_full, v799, v793, v2253); + svfloat32_t v865 = svcmla_f32_x(pred_full, v323, v1992, v323, 90); + svfloat32_t v878 = svcmla_f32_x(pred_full, v490, v1992, v490, 90); + svfloat32_t v891 = svcmla_f32_x(pred_full, v824, v1992, v824, 90); + svfloat32_t v911 = svcmla_f32_x(pred_full, v657, v1992, v657, 90); + svfloat32_t v149 = svnmls_f32_x(pred_full, v143, v131, v2295); + svfloat32_t v172 = svsub_f32_x(svptrue_b32(), v143, v171); + svfloat32_t v316 = svnmls_f32_x(pred_full, v310, v298, v2295); + svfloat32_t v339 = svsub_f32_x(svptrue_b32(), v310, v338); + svfloat32_t v483 = svnmls_f32_x(pred_full, v477, v465, v2295); + svfloat32_t v506 = svsub_f32_x(svptrue_b32(), v477, v505); + svfloat32_t v650 = svnmls_f32_x(pred_full, v644, v632, v2295); + svfloat32_t v673 = svsub_f32_x(svptrue_b32(), v644, v672); + svfloat32_t v817 = svnmls_f32_x(pred_full, v811, v799, v2295); + svfloat32_t v840 = svsub_f32_x(svptrue_b32(), v811, v839); + svfloat32_t v892 = svsub_f32_x(svptrue_b32(), v865, v891); + svfloat32_t v912 = svsub_f32_x(svptrue_b32(), v878, v911); + svfloat32_t v164 = svsub_f32_x(svptrue_b32(), v149, v163); + svfloat32_t v178 = svnmls_f32_x(pred_full, v172, v143, v2295); + svfloat32_t v331 = svsub_f32_x(svptrue_b32(), v316, v330); + svfloat32_t v345 = svnmls_f32_x(pred_full, v339, v310, v2295); + svfloat32_t v498 = svsub_f32_x(svptrue_b32(), v483, v497); + svfloat32_t v512 = svnmls_f32_x(pred_full, v506, v477, v2295); + svfloat32_t v665 = svsub_f32_x(svptrue_b32(), v650, v664); + svfloat32_t v679 = svnmls_f32_x(pred_full, v673, v644, v2295); + svfloat32_t v832 = svsub_f32_x(svptrue_b32(), v817, v831); + svfloat32_t v846 = svnmls_f32_x(pred_full, v840, v811, v2295); + svfloat32_t v898 = svnmls_f32_x(pred_full, v892, v865, v2295); + svfloat32_t v918 = svnmls_f32_x(pred_full, v912, v878, v2295); + svfloat32_t v1181 = svmul_f32_x(svptrue_b32(), v339, v2113); + svfloat32_t v1194 = svmul_f32_x(svptrue_b32(), v506, v2241); + svfloat32_t v1207 = svmul_f32_x(svptrue_b32(), v840, v2243); + svfloat32_t v1227 = svmul_f32_x(svptrue_b32(), v673, v2179); + svfloat32_t v184 = svnmls_f32_x(pred_full, v164, v149, v2295); + svfloat32_t v351 = svnmls_f32_x(pred_full, v331, v316, v2295); + svfloat32_t v518 = svnmls_f32_x(pred_full, v498, v483, v2295); + svfloat32_t v685 = svnmls_f32_x(pred_full, v665, v650, v2295); + svfloat32_t v852 = svnmls_f32_x(pred_full, v832, v817, v2295); + svfloat32_t v919 = svadd_f32_x(svptrue_b32(), v898, v918); + svfloat32_t v920 = svsub_f32_x(svptrue_b32(), v898, v918); + svfloat32_t v932 = svmla_f32_x(pred_full, v892, v912, v2255); + svfloat32_t v950 = svnmls_f32_x(pred_full, v912, v892, v2255); + svfloat32_t v1019 = svmul_f32_x(svptrue_b32(), v331, v2049); + svfloat32_t v1032 = svmul_f32_x(svptrue_b32(), v498, v2113); + svfloat32_t v1045 = svmul_f32_x(svptrue_b32(), v832, v2241); + svfloat32_t v1065 = svmul_f32_x(svptrue_b32(), v665, v2177); + svfloat32_t v1189 = svcmla_f32_x(pred_full, v1181, v2114, v339, 90); + svfloat32_t v1202 = svcmla_f32_x(pred_full, v1194, v2242, 
v506, 90); + svfloat32_t v1215 = svcmla_f32_x(pred_full, v1207, v2244, v840, 90); + svfloat32_t v1235 = svcmla_f32_x(pred_full, v1227, v2180, v673, 90); + svfloat32_t v1343 = svmul_f32_x(svptrue_b32(), v345, v2177); + svfloat32_t v1356 = svmul_f32_x(svptrue_b32(), v512, v2179); + svfloat32_t v1369 = svmul_f32_x(svptrue_b32(), v846, v2248); + svfloat32_t v1389 = svmul_f32_x(svptrue_b32(), v679, v2245); + svfloat32_t v951 = svadd_f32_x(svptrue_b32(), v156, v919); + svfloat32_t zero965 = svdup_n_f32(0); + svfloat32_t v965 = svcmla_f32_x(pred_full, zero965, v2275, v932, 90); + svfloat32_t zero980 = svdup_n_f32(0); + svfloat32_t v980 = svcmla_f32_x(pred_full, zero980, v2275, v950, 90); + svfloat32_t v1027 = svcmla_f32_x(pred_full, v1019, v2050, v331, 90); + svfloat32_t v1040 = svcmla_f32_x(pred_full, v1032, v2114, v498, 90); + svfloat32_t v1053 = svcmla_f32_x(pred_full, v1045, v2242, v832, 90); + svfloat32_t v1073 = svcmla_f32_x(pred_full, v1065, v2178, v665, 90); + svfloat32_t v1216 = svsub_f32_x(svptrue_b32(), v1189, v1215); + svfloat32_t v1236 = svsub_f32_x(svptrue_b32(), v1202, v1235); + svfloat32_t v1351 = svcmla_f32_x(pred_full, v1343, v2178, v345, 90); + svfloat32_t v1364 = svcmla_f32_x(pred_full, v1356, v2180, v512, 90); + svfloat32_t v1377 = svcmla_f32_x(pred_full, v1369, v2249, v846, 90); + svfloat32_t v1397 = svcmla_f32_x(pred_full, v1389, v2185, v679, 90); + svfloat32_t v1505 = svmul_f32_x(svptrue_b32(), v351, v2241); + svfloat32_t v1518 = svmul_f32_x(svptrue_b32(), v518, v2243); + svfloat32_t v1531 = svmul_f32_x(svptrue_b32(), v852, v2245); + svfloat32_t v1551 = svmul_f32_x(svptrue_b32(), v685, v2248); + svfloat32_t v926 = svmls_f32_x(pred_full, v156, v919, v2251); + svfloat32_t v1054 = svsub_f32_x(svptrue_b32(), v1027, v1053); + svfloat32_t v1074 = svsub_f32_x(svptrue_b32(), v1040, v1073); + svfloat32_t v1222 = svnmls_f32_x(pred_full, v1216, v1189, v2295); + svfloat32_t v1242 = svnmls_f32_x(pred_full, v1236, v1202, v2295); + svfloat32_t v1378 = svsub_f32_x(svptrue_b32(), v1351, v1377); + svfloat32_t v1398 = svsub_f32_x(svptrue_b32(), v1364, v1397); + svfloat32_t v1513 = svcmla_f32_x(pred_full, v1505, v2242, v351, 90); + svfloat32_t v1526 = svcmla_f32_x(pred_full, v1518, v2244, v518, 90); + svfloat32_t v1539 = svcmla_f32_x(pred_full, v1531, v2246, v852, 90); + svfloat32_t v1559 = svcmla_f32_x(pred_full, v1551, v2249, v685, 90); + svst1_f64(pred_full, (double *)(v2006), svreinterpret_f64_f32(v951)); + svfloat32_t v938 = svmls_f32_x(pred_full, v926, v920, v2253); + svfloat32_t v1060 = svnmls_f32_x(pred_full, v1054, v1027, v2295); + svfloat32_t v1080 = svnmls_f32_x(pred_full, v1074, v1040, v2295); + svfloat32_t v1243 = svadd_f32_x(svptrue_b32(), v1222, v1242); + svfloat32_t v1244 = svsub_f32_x(svptrue_b32(), v1222, v1242); + svfloat32_t v1256 = svmla_f32_x(pred_full, v1216, v1236, v2255); + svfloat32_t v1274 = svnmls_f32_x(pred_full, v1236, v1216, v2255); + svfloat32_t v1384 = svnmls_f32_x(pred_full, v1378, v1351, v2295); + svfloat32_t v1404 = svnmls_f32_x(pred_full, v1398, v1364, v2295); + svfloat32_t v1540 = svsub_f32_x(svptrue_b32(), v1513, v1539); + svfloat32_t v1560 = svsub_f32_x(svptrue_b32(), v1526, v1559); + svfloat32_t v944 = svnmls_f32_x(pred_full, v938, v926, v2295); + svfloat32_t v981 = svsub_f32_x(svptrue_b32(), v938, v980); + svfloat32_t v1081 = svadd_f32_x(svptrue_b32(), v1060, v1080); + svfloat32_t v1082 = svsub_f32_x(svptrue_b32(), v1060, v1080); + svfloat32_t v1094 = svmla_f32_x(pred_full, v1054, v1074, v2255); + svfloat32_t v1112 = svnmls_f32_x(pred_full, v1074, 
v1054, v2255); + svfloat32_t v1275 = svadd_f32_x(svptrue_b32(), v172, v1243); + svfloat32_t zero1289 = svdup_n_f32(0); + svfloat32_t v1289 = svcmla_f32_x(pred_full, zero1289, v2275, v1256, 90); + svfloat32_t zero1304 = svdup_n_f32(0); + svfloat32_t v1304 = svcmla_f32_x(pred_full, zero1304, v2275, v1274, 90); + svfloat32_t v1405 = svadd_f32_x(svptrue_b32(), v1384, v1404); + svfloat32_t v1406 = svsub_f32_x(svptrue_b32(), v1384, v1404); + svfloat32_t v1418 = svmla_f32_x(pred_full, v1378, v1398, v2255); + svfloat32_t v1436 = svnmls_f32_x(pred_full, v1398, v1378, v2255); + svfloat32_t v1546 = svnmls_f32_x(pred_full, v1540, v1513, v2295); + svfloat32_t v1566 = svnmls_f32_x(pred_full, v1560, v1526, v2295); + svfloat32_t v966 = svsub_f32_x(svptrue_b32(), v944, v965); + svfloat32_t v994 = svnmls_f32_x(pred_full, v981, v938, v2295); + svfloat32_t v1113 = svadd_f32_x(svptrue_b32(), v164, v1081); + svfloat32_t zero1127 = svdup_n_f32(0); + svfloat32_t v1127 = svcmla_f32_x(pred_full, zero1127, v2275, v1094, 90); + svfloat32_t zero1142 = svdup_n_f32(0); + svfloat32_t v1142 = svcmla_f32_x(pred_full, zero1142, v2275, v1112, 90); + svfloat32_t v1250 = svmls_f32_x(pred_full, v172, v1243, v2251); + svfloat32_t v1437 = svadd_f32_x(svptrue_b32(), v178, v1405); + svfloat32_t zero1451 = svdup_n_f32(0); + svfloat32_t v1451 = svcmla_f32_x(pred_full, zero1451, v2275, v1418, 90); + svfloat32_t zero1466 = svdup_n_f32(0); + svfloat32_t v1466 = svcmla_f32_x(pred_full, zero1466, v2275, v1436, 90); + svfloat32_t v1567 = svadd_f32_x(svptrue_b32(), v1546, v1566); + svfloat32_t v1568 = svsub_f32_x(svptrue_b32(), v1546, v1566); + svfloat32_t v1580 = svmla_f32_x(pred_full, v1540, v1560, v2255); + svfloat32_t v1598 = svnmls_f32_x(pred_full, v1560, v1540, v2255); + svst1_f64(pred_full, (double *)(v2026), svreinterpret_f64_f32(v981)); + svst1_f64(pred_full, (double *)(v2134), svreinterpret_f64_f32(v1275)); + svfloat32_t v1007 = svnmls_f32_x(pred_full, v966, v944, v2295); + svfloat32_t v1088 = svmls_f32_x(pred_full, v164, v1081, v2251); + svfloat32_t v1262 = svmls_f32_x(pred_full, v1250, v1244, v2253); + svfloat32_t v1412 = svmls_f32_x(pred_full, v178, v1405, v2251); + svfloat32_t v1599 = svadd_f32_x(svptrue_b32(), v184, v1567); + svfloat32_t zero1613 = svdup_n_f32(0); + svfloat32_t v1613 = svcmla_f32_x(pred_full, zero1613, v2275, v1580, 90); + svfloat32_t zero1628 = svdup_n_f32(0); + svfloat32_t v1628 = svcmla_f32_x(pred_full, zero1628, v2275, v1598, 90); + svst1_f64(pred_full, (double *)(v2016), svreinterpret_f64_f32(v966)); + svst1_f64(pred_full, (double *)(v2036), svreinterpret_f64_f32(v994)); + svst1_f64(pred_full, (double *)(v2070), svreinterpret_f64_f32(v1113)); + svst1_f64(pred_full, (double *)(v2198), svreinterpret_f64_f32(v1437)); + svfloat32_t v1100 = svmls_f32_x(pred_full, v1088, v1082, v2253); + svfloat32_t v1268 = svnmls_f32_x(pred_full, v1262, v1250, v2295); + svfloat32_t v1305 = svsub_f32_x(svptrue_b32(), v1262, v1304); + svfloat32_t v1424 = svmls_f32_x(pred_full, v1412, v1406, v2253); + svfloat32_t v1574 = svmls_f32_x(pred_full, v184, v1567, v2251); + svst1_f64(pred_full, (double *)(v2046), svreinterpret_f64_f32(v1007)); + svst1_f64(pred_full, (double *)(v2262), svreinterpret_f64_f32(v1599)); + svfloat32_t v1106 = svnmls_f32_x(pred_full, v1100, v1088, v2295); + svfloat32_t v1143 = svsub_f32_x(svptrue_b32(), v1100, v1142); + svfloat32_t v1290 = svsub_f32_x(svptrue_b32(), v1268, v1289); + svfloat32_t v1318 = svnmls_f32_x(pred_full, v1305, v1262, v2295); + svfloat32_t v1430 = svnmls_f32_x(pred_full, v1424, v1412, 
v2295); + svfloat32_t v1467 = svsub_f32_x(svptrue_b32(), v1424, v1466); + svfloat32_t v1586 = svmls_f32_x(pred_full, v1574, v1568, v2253); + svst1_f64(pred_full, (double *)(v2154), svreinterpret_f64_f32(v1305)); + svfloat32_t v1128 = svsub_f32_x(svptrue_b32(), v1106, v1127); + svfloat32_t v1156 = svnmls_f32_x(pred_full, v1143, v1100, v2295); + svfloat32_t v1331 = svnmls_f32_x(pred_full, v1290, v1268, v2295); + svfloat32_t v1452 = svsub_f32_x(svptrue_b32(), v1430, v1451); + svfloat32_t v1480 = svnmls_f32_x(pred_full, v1467, v1424, v2295); + svfloat32_t v1592 = svnmls_f32_x(pred_full, v1586, v1574, v2295); + svfloat32_t v1629 = svsub_f32_x(svptrue_b32(), v1586, v1628); + svst1_f64(pred_full, (double *)(v2090), svreinterpret_f64_f32(v1143)); + svst1_f64(pred_full, (double *)(v2144), svreinterpret_f64_f32(v1290)); + svst1_f64(pred_full, (double *)(v2164), svreinterpret_f64_f32(v1318)); + svst1_f64(pred_full, (double *)(v2218), svreinterpret_f64_f32(v1467)); + svfloat32_t v1169 = svnmls_f32_x(pred_full, v1128, v1106, v2295); + svfloat32_t v1493 = svnmls_f32_x(pred_full, v1452, v1430, v2295); + svfloat32_t v1614 = svsub_f32_x(svptrue_b32(), v1592, v1613); + svfloat32_t v1642 = svnmls_f32_x(pred_full, v1629, v1586, v2295); + svst1_f64(pred_full, (double *)(v2080), svreinterpret_f64_f32(v1128)); + svst1_f64(pred_full, (double *)(v2100), svreinterpret_f64_f32(v1156)); + svst1_f64(pred_full, (double *)(v2174), svreinterpret_f64_f32(v1331)); + svst1_f64(pred_full, (double *)(v2208), svreinterpret_f64_f32(v1452)); + svst1_f64(pred_full, (double *)(v2228), svreinterpret_f64_f32(v1480)); + svst1_f64(pred_full, (double *)(v2282), svreinterpret_f64_f32(v1629)); + svfloat32_t v1655 = svnmls_f32_x(pred_full, v1614, v1592, v2295); + svst1_f64(pred_full, (double *)(v2110), svreinterpret_f64_f32(v1169)); + svst1_f64(pred_full, (double *)(v2238), svreinterpret_f64_f32(v1493)); + svst1_f64(pred_full, (double *)(v2272), svreinterpret_f64_f32(v1614)); + svst1_f64(pred_full, (double *)(v2292), svreinterpret_f64_f32(v1642)); + svst1_f64(pred_full, (double *)(v2302), svreinterpret_f64_f32(v1655)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cf32_ac_n_gu28(const armral_cmplx_int16_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + int idist, float dir) { + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + for (int j = 0; j < howmany; j += 1) { + int16x4_t v100 = vld1s_s16(&v5[istride]); + float v431 = 4.4095855184409838e-01F; + float v438 = 3.4087293062393137e-01F; + float v445 = -5.3396936033772524e-01F; + float v452 = 8.7484229096165667e-01F; + float v495 = 1.0000000000000000e+00F; + float v496 = -1.0000000000000000e+00F; + float v502 = -1.1666666666666665e+00F; + float v503 = 1.1666666666666665e+00F; + float v509 = 7.9015646852540022e-01F; + float v510 = -7.9015646852540022e-01F; + float v516 = 5.5854267289647742e-02F; + float v517 = -5.5854267289647742e-02F; + float v523 = 7.3430220123575241e-01F; + float v524 = -7.3430220123575241e-01F; + float32x2_t v526 = (float32x2_t){v4, v4}; + float v531 = -4.4095855184409838e-01F; + float v535 = -3.4087293062393137e-01F; + float v539 = 5.3396936033772524e-01F; + float v543 = -8.7484229096165667e-01F; + int16x4_t v20 = vld1s_s16(&v5[0]); + float32x2_t v101 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v100)), 15); + float32x2_t v417 = (float32x2_t){v502, v502}; + float32x2_t v421 = (float32x2_t){v509, v509}; + float32x2_t v425 = 
(float32x2_t){v516, v516}; + float32x2_t v429 = (float32x2_t){v523, v523}; + float32x2_t v433 = (float32x2_t){v431, v531}; + float32x2_t v440 = (float32x2_t){v438, v535}; + float32x2_t v447 = (float32x2_t){v445, v539}; + float32x2_t v454 = (float32x2_t){v452, v543}; + float32x2_t v497 = (float32x2_t){v495, v496}; + float32x2_t v504 = (float32x2_t){v502, v503}; + float32x2_t v511 = (float32x2_t){v509, v510}; + float32x2_t v518 = (float32x2_t){v516, v517}; + float32x2_t v525 = (float32x2_t){v523, v524}; + float32x2_t v532 = (float32x2_t){v531, v531}; + float32x2_t v536 = (float32x2_t){v535, v535}; + float32x2_t v540 = (float32x2_t){v539, v539}; + float32x2_t v544 = (float32x2_t){v543, v543}; + float32x2_t v21 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v20)), 15); + int16x4_t v26 = vld1s_s16(&v5[istride * 14]); + int16x4_t v34 = vld1s_s16(&v5[istride * 7]); + int16x4_t v40 = vld1s_s16(&v5[istride * 21]); + int16x4_t v50 = vld1s_s16(&v5[istride * 4]); + int16x4_t v56 = vld1s_s16(&v5[istride * 18]); + int16x4_t v64 = vld1s_s16(&v5[istride * 11]); + int16x4_t v70 = vld1s_s16(&v5[istride * 25]); + int16x4_t v80 = vld1s_s16(&v5[istride * 8]); + int16x4_t v86 = vld1s_s16(&v5[istride * 22]); + int16x4_t v94 = vld1s_s16(&v5[istride * 15]); + int16x4_t v110 = vld1s_s16(&v5[istride * 12]); + int16x4_t v116 = vld1s_s16(&v5[istride * 26]); + int16x4_t v124 = vld1s_s16(&v5[istride * 19]); + int16x4_t v130 = vld1s_s16(&v5[istride * 5]); + int16x4_t v140 = vld1s_s16(&v5[istride * 16]); + int16x4_t v146 = vld1s_s16(&v5[istride * 2]); + int16x4_t v154 = vld1s_s16(&v5[istride * 23]); + int16x4_t v160 = vld1s_s16(&v5[istride * 9]); + int16x4_t v170 = vld1s_s16(&v5[istride * 20]); + int16x4_t v176 = vld1s_s16(&v5[istride * 6]); + int16x4_t v184 = vld1s_s16(&v5[istride * 27]); + int16x4_t v190 = vld1s_s16(&v5[istride * 13]); + int16x4_t v200 = vld1s_s16(&v5[istride * 24]); + int16x4_t v206 = vld1s_s16(&v5[istride * 10]); + int16x4_t v214 = vld1s_s16(&v5[istride * 3]); + int16x4_t v220 = vld1s_s16(&v5[istride * 17]); + float32x2_t v435 = vmul_f32(v526, v433); + float32x2_t v442 = vmul_f32(v526, v440); + float32x2_t v449 = vmul_f32(v526, v447); + float32x2_t v456 = vmul_f32(v526, v454); + float32x2_t v499 = vmul_f32(v526, v497); + float32x2_t v506 = vmul_f32(v526, v504); + float32x2_t v513 = vmul_f32(v526, v511); + float32x2_t v520 = vmul_f32(v526, v518); + float32x2_t v527 = vmul_f32(v526, v525); + float32x2_t v27 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v26)), 15); + float32x2_t v35 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v34)), 15); + float32x2_t v41 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v40)), 15); + float32x2_t v51 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v50)), 15); + float32x2_t v57 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v56)), 15); + float32x2_t v65 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v64)), 15); + float32x2_t v71 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v70)), 15); + float32x2_t v81 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v80)), 15); + float32x2_t v87 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v86)), 15); + float32x2_t v95 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v94)), 15); + float32x2_t v111 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v110)), 15); + float32x2_t v117 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v116)), 15); + float32x2_t v125 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v124)), 15); + float32x2_t v131 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v130)), 15); + float32x2_t v141 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v140)), 15); + float32x2_t v147 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v146)), 15); + 
float32x2_t v155 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v154)), 15); + float32x2_t v161 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v160)), 15); + float32x2_t v171 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v170)), 15); + float32x2_t v177 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v176)), 15); + float32x2_t v185 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v184)), 15); + float32x2_t v191 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v190)), 15); + float32x2_t v201 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v200)), 15); + float32x2_t v207 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v206)), 15); + float32x2_t v215 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v214)), 15); + float32x2_t v221 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v220)), 15); + float32x2_t v28 = vadd_f32(v21, v27); + float32x2_t v29 = vsub_f32(v21, v27); + float32x2_t v42 = vadd_f32(v35, v41); + float32x2_t v43 = vsub_f32(v35, v41); + float32x2_t v58 = vadd_f32(v51, v57); + float32x2_t v59 = vsub_f32(v51, v57); + float32x2_t v72 = vadd_f32(v65, v71); + float32x2_t v73 = vsub_f32(v65, v71); + float32x2_t v88 = vadd_f32(v81, v87); + float32x2_t v89 = vsub_f32(v81, v87); + float32x2_t v102 = vadd_f32(v95, v101); + float32x2_t v103 = vsub_f32(v95, v101); + float32x2_t v118 = vadd_f32(v111, v117); + float32x2_t v119 = vsub_f32(v111, v117); + float32x2_t v132 = vadd_f32(v125, v131); + float32x2_t v133 = vsub_f32(v125, v131); + float32x2_t v148 = vadd_f32(v141, v147); + float32x2_t v149 = vsub_f32(v141, v147); + float32x2_t v162 = vadd_f32(v155, v161); + float32x2_t v163 = vsub_f32(v155, v161); + float32x2_t v178 = vadd_f32(v171, v177); + float32x2_t v179 = vsub_f32(v171, v177); + float32x2_t v192 = vadd_f32(v185, v191); + float32x2_t v193 = vsub_f32(v185, v191); + float32x2_t v208 = vadd_f32(v201, v207); + float32x2_t v209 = vsub_f32(v201, v207); + float32x2_t v222 = vadd_f32(v215, v221); + float32x2_t v223 = vsub_f32(v215, v221); + float32x2_t v44 = vadd_f32(v28, v42); + float32x2_t v45 = vsub_f32(v28, v42); + float32x2_t v74 = vadd_f32(v58, v72); + float32x2_t v75 = vsub_f32(v58, v72); + float32x2_t v104 = vadd_f32(v88, v102); + float32x2_t v105 = vsub_f32(v88, v102); + float32x2_t v134 = vadd_f32(v118, v132); + float32x2_t v135 = vsub_f32(v118, v132); + float32x2_t v164 = vadd_f32(v148, v162); + float32x2_t v165 = vsub_f32(v148, v162); + float32x2_t v194 = vadd_f32(v178, v192); + float32x2_t v195 = vsub_f32(v178, v192); + float32x2_t v224 = vadd_f32(v208, v222); + float32x2_t v225 = vsub_f32(v208, v222); + float32x2_t v394 = vadd_f32(v59, v209); + float32x2_t v395 = vsub_f32(v59, v209); + float32x2_t v396 = vadd_f32(v149, v119); + float32x2_t v397 = vsub_f32(v149, v119); + float32x2_t v398 = vadd_f32(v89, v179); + float32x2_t v399 = vsub_f32(v89, v179); + float32x2_t v478 = vadd_f32(v73, v223); + float32x2_t v479 = vsub_f32(v73, v223); + float32x2_t v480 = vadd_f32(v163, v133); + float32x2_t v481 = vsub_f32(v163, v133); + float32x2_t v482 = vadd_f32(v103, v193); + float32x2_t v483 = vsub_f32(v103, v193); + float32x2_t v226 = vadd_f32(v74, v224); + float32x2_t v227 = vsub_f32(v74, v224); + float32x2_t v228 = vadd_f32(v164, v134); + float32x2_t v229 = vsub_f32(v164, v134); + float32x2_t v230 = vadd_f32(v104, v194); + float32x2_t v231 = vsub_f32(v104, v194); + float32x2_t v310 = vadd_f32(v75, v225); + float32x2_t v311 = vsub_f32(v75, v225); + float32x2_t v312 = vadd_f32(v165, v135); + float32x2_t v313 = vsub_f32(v165, v135); + float32x2_t v314 = vadd_f32(v105, v195); + float32x2_t v315 = vsub_f32(v105, v195); + float32x2_t v400 = vadd_f32(v394, v396); + 
float32x2_t v403 = vsub_f32(v394, v396); + float32x2_t v404 = vsub_f32(v396, v398); + float32x2_t v405 = vsub_f32(v398, v394); + float32x2_t v406 = vadd_f32(v395, v397); + float32x2_t v408 = vsub_f32(v395, v397); + float32x2_t v409 = vsub_f32(v397, v399); + float32x2_t v410 = vsub_f32(v399, v395); + float32x2_t v484 = vadd_f32(v478, v480); + float32x2_t v487 = vsub_f32(v478, v480); + float32x2_t v488 = vsub_f32(v480, v482); + float32x2_t v489 = vsub_f32(v482, v478); + float32x2_t v490 = vadd_f32(v479, v481); + float32x2_t v492 = vsub_f32(v479, v481); + float32x2_t v493 = vsub_f32(v481, v483); + float32x2_t v494 = vsub_f32(v483, v479); + float32x2_t v232 = vadd_f32(v226, v228); + float32x2_t v235 = vsub_f32(v226, v228); + float32x2_t v236 = vsub_f32(v228, v230); + float32x2_t v237 = vsub_f32(v230, v226); + float32x2_t v238 = vadd_f32(v227, v229); + float32x2_t v240 = vsub_f32(v227, v229); + float32x2_t v241 = vsub_f32(v229, v231); + float32x2_t v242 = vsub_f32(v231, v227); + float32x2_t v316 = vadd_f32(v310, v312); + float32x2_t v319 = vsub_f32(v310, v312); + float32x2_t v320 = vsub_f32(v312, v314); + float32x2_t v321 = vsub_f32(v314, v310); + float32x2_t v322 = vadd_f32(v311, v313); + float32x2_t v324 = vsub_f32(v311, v313); + float32x2_t v325 = vsub_f32(v313, v315); + float32x2_t v326 = vsub_f32(v315, v311); + float32x2_t v401 = vadd_f32(v400, v398); + float32x2_t v407 = vadd_f32(v406, v399); + float32x2_t v422 = vmul_f32(v403, v421); + float32x2_t v426 = vmul_f32(v404, v425); + float32x2_t v430 = vmul_f32(v405, v429); + float32x2_t v443 = vrev64_f32(v408); + float32x2_t v450 = vrev64_f32(v409); + float32x2_t v457 = vrev64_f32(v410); + float32x2_t v485 = vadd_f32(v484, v482); + float32x2_t v491 = vadd_f32(v490, v483); + float32x2_t v514 = vrev64_f32(v487); + float32x2_t v521 = vrev64_f32(v488); + float32x2_t v528 = vrev64_f32(v489); + float32x2_t v537 = vmul_f32(v492, v536); + float32x2_t v541 = vmul_f32(v493, v540); + float32x2_t v545 = vmul_f32(v494, v544); + float32x2_t v233 = vadd_f32(v232, v230); + float32x2_t v239 = vadd_f32(v238, v231); + float32x2_t v254 = vmul_f32(v235, v421); + float32x2_t v258 = vmul_f32(v236, v425); + float32x2_t v262 = vmul_f32(v237, v429); + float32x2_t v275 = vrev64_f32(v240); + float32x2_t v282 = vrev64_f32(v241); + float32x2_t v289 = vrev64_f32(v242); + float32x2_t v317 = vadd_f32(v316, v314); + float32x2_t v323 = vadd_f32(v322, v315); + float32x2_t v338 = vmul_f32(v319, v421); + float32x2_t v342 = vmul_f32(v320, v425); + float32x2_t v346 = vmul_f32(v321, v429); + float32x2_t v359 = vrev64_f32(v324); + float32x2_t v366 = vrev64_f32(v325); + float32x2_t v373 = vrev64_f32(v326); + float32x2_t v402 = vadd_f32(v401, v29); + float32x2_t v418 = vmul_f32(v401, v417); + float32x2_t v436 = vrev64_f32(v407); + float32x2_t v444 = vmul_f32(v443, v442); + float32x2_t v451 = vmul_f32(v450, v449); + float32x2_t v458 = vmul_f32(v457, v456); + float32x2_t v486 = vadd_f32(v485, v43); + float32x2_t v507 = vrev64_f32(v485); + float32x2_t v515 = vmul_f32(v514, v513); + float32x2_t v522 = vmul_f32(v521, v520); + float32x2_t v529 = vmul_f32(v528, v527); + float32x2_t v533 = vmul_f32(v491, v532); + float32x2_t v234 = vadd_f32(v233, v44); + float32x2_t v250 = vmul_f32(v233, v417); + float32x2_t v268 = vrev64_f32(v239); + float32x2_t v276 = vmul_f32(v275, v442); + float32x2_t v283 = vmul_f32(v282, v449); + float32x2_t v290 = vmul_f32(v289, v456); + float32x2_t v318 = vadd_f32(v317, v45); + float32x2_t v334 = vmul_f32(v317, v417); + float32x2_t v352 = vrev64_f32(v323); + 
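+    // vrev64_f32 swaps the real and imaginary lanes of a complex value; the
+    // swapped value is then multiplied by a constant vector that was
+    // pre-scaled by dir, so the sign of dir effectively selects between the
+    // forward and inverse transform.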
float32x2_t v360 = vmul_f32(v359, v442); + float32x2_t v367 = vmul_f32(v366, v449); + float32x2_t v374 = vmul_f32(v373, v456); + float32x2_t v437 = vmul_f32(v436, v435); + float32x2_t v459 = vadd_f32(v402, v418); + float32x2_t v500 = vrev64_f32(v486); + float32x2_t v508 = vmul_f32(v507, v506); + float32x2_t v553 = vadd_f32(v533, v537); + float32x2_t v555 = vsub_f32(v533, v537); + float32x2_t v557 = vsub_f32(v533, v541); + float32x2_t v269 = vmul_f32(v268, v435); + float32x2_t v291 = vadd_f32(v234, v250); + float32x2_t v353 = vmul_f32(v352, v435); + float32x2_t v375 = vadd_f32(v318, v334); + float32x2_t v460 = vadd_f32(v459, v422); + float32x2_t v462 = vsub_f32(v459, v422); + float32x2_t v464 = vsub_f32(v459, v426); + float32x2_t v466 = vadd_f32(v437, v444); + float32x2_t v468 = vsub_f32(v437, v444); + float32x2_t v470 = vsub_f32(v437, v451); + float32x2_t v501 = vmul_f32(v500, v499); + float32x2_t v554 = vadd_f32(v553, v541); + float32x2_t v556 = vsub_f32(v555, v545); + float32x2_t v558 = vadd_f32(v557, v545); + v6[0] = v234; + v6[ostride * 14] = v318; + float32x2_t v292 = vadd_f32(v291, v254); + float32x2_t v294 = vsub_f32(v291, v254); + float32x2_t v296 = vsub_f32(v291, v258); + float32x2_t v298 = vadd_f32(v269, v276); + float32x2_t v300 = vsub_f32(v269, v276); + float32x2_t v302 = vsub_f32(v269, v283); + float32x2_t v376 = vadd_f32(v375, v338); + float32x2_t v378 = vsub_f32(v375, v338); + float32x2_t v380 = vsub_f32(v375, v342); + float32x2_t v382 = vadd_f32(v353, v360); + float32x2_t v384 = vsub_f32(v353, v360); + float32x2_t v386 = vsub_f32(v353, v367); + float32x2_t v461 = vadd_f32(v460, v426); + float32x2_t v463 = vsub_f32(v462, v430); + float32x2_t v465 = vadd_f32(v464, v430); + float32x2_t v467 = vadd_f32(v466, v451); + float32x2_t v469 = vsub_f32(v468, v458); + float32x2_t v471 = vadd_f32(v470, v458); + float32x2_t v546 = vadd_f32(v501, v508); + float32x2_t v565 = vadd_f32(v402, v501); + float32x2_t v566 = vsub_f32(v402, v501); + float32x2_t v293 = vadd_f32(v292, v258); + float32x2_t v295 = vsub_f32(v294, v262); + float32x2_t v297 = vadd_f32(v296, v262); + float32x2_t v299 = vadd_f32(v298, v283); + float32x2_t v301 = vsub_f32(v300, v290); + float32x2_t v303 = vadd_f32(v302, v290); + float32x2_t v377 = vadd_f32(v376, v342); + float32x2_t v379 = vsub_f32(v378, v346); + float32x2_t v381 = vadd_f32(v380, v346); + float32x2_t v383 = vadd_f32(v382, v367); + float32x2_t v385 = vsub_f32(v384, v374); + float32x2_t v387 = vadd_f32(v386, v374); + float32x2_t v472 = vadd_f32(v461, v467); + float32x2_t v473 = vsub_f32(v461, v467); + float32x2_t v474 = vadd_f32(v463, v469); + float32x2_t v475 = vsub_f32(v463, v469); + float32x2_t v476 = vadd_f32(v465, v471); + float32x2_t v477 = vsub_f32(v465, v471); + float32x2_t v547 = vadd_f32(v546, v515); + float32x2_t v549 = vsub_f32(v546, v515); + float32x2_t v551 = vsub_f32(v546, v522); + v6[ostride * 21] = v566; + v6[ostride * 7] = v565; + float32x2_t v304 = vadd_f32(v293, v299); + float32x2_t v305 = vsub_f32(v293, v299); + float32x2_t v306 = vadd_f32(v295, v301); + float32x2_t v307 = vsub_f32(v295, v301); + float32x2_t v308 = vadd_f32(v297, v303); + float32x2_t v309 = vsub_f32(v297, v303); + float32x2_t v388 = vadd_f32(v377, v383); + float32x2_t v389 = vsub_f32(v377, v383); + float32x2_t v390 = vadd_f32(v379, v385); + float32x2_t v391 = vsub_f32(v379, v385); + float32x2_t v392 = vadd_f32(v381, v387); + float32x2_t v393 = vsub_f32(v381, v387); + float32x2_t v548 = vadd_f32(v547, v522); + float32x2_t v550 = vsub_f32(v549, v529); + float32x2_t v552 = 
vadd_f32(v551, v529); + float32x2_t v559 = vadd_f32(v548, v554); + float32x2_t v560 = vsub_f32(v548, v554); + float32x2_t v561 = vadd_f32(v550, v556); + float32x2_t v562 = vsub_f32(v550, v556); + float32x2_t v563 = vadd_f32(v552, v558); + float32x2_t v564 = vsub_f32(v552, v558); + v6[ostride * 8] = v305; + v6[ostride * 22] = v389; + v6[ostride * 16] = v307; + v6[ostride * 2] = v391; + v6[ostride * 24] = v308; + v6[ostride * 10] = v392; + v6[ostride * 4] = v309; + v6[ostride * 18] = v393; + v6[ostride * 12] = v306; + v6[ostride * 26] = v390; + v6[ostride * 20] = v304; + v6[ostride * 6] = v388; + float32x2_t v587 = vadd_f32(v473, v560); + float32x2_t v588 = vsub_f32(v473, v560); + float32x2_t v609 = vadd_f32(v475, v562); + float32x2_t v610 = vsub_f32(v475, v562); + float32x2_t v631 = vadd_f32(v476, v563); + float32x2_t v632 = vsub_f32(v476, v563); + float32x2_t v653 = vadd_f32(v477, v564); + float32x2_t v654 = vsub_f32(v477, v564); + float32x2_t v675 = vadd_f32(v474, v561); + float32x2_t v676 = vsub_f32(v474, v561); + float32x2_t v697 = vadd_f32(v472, v559); + float32x2_t v698 = vsub_f32(v472, v559); + v6[ostride] = v588; + v6[ostride * 15] = v587; + v6[ostride * 9] = v610; + v6[ostride * 23] = v609; + v6[ostride * 17] = v632; + v6[ostride * 3] = v631; + v6[ostride * 25] = v654; + v6[ostride * 11] = v653; + v6[ostride * 5] = v676; + v6[ostride * 19] = v675; + v6[ostride * 13] = v698; + v6[ostride * 27] = v697; + v5 += 1 * idist; + v6 += 1 * 1; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cf32_ac_n_gu28(const armral_cmplx_int16_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + int idist, float dir) { + int64_t v0 = istride; + int64_t v1 = idist; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * v1; + int64_t v12 = v10 * 1; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + float v485 = -1.1666666666666665e+00F; + float v490 = 7.9015646852540022e-01F; + float v495 = 5.5854267289647742e-02F; + float v500 = 7.3430220123575241e-01F; + float v569 = -1.0000000000000000e+00F; + float v576 = 1.1666666666666665e+00F; + float v583 = -7.9015646852540022e-01F; + float v590 = -5.5854267289647742e-02F; + float v597 = -7.3430220123575241e-01F; + float v604 = -4.4095855184409838e-01F; + float v609 = -3.4087293062393137e-01F; + float v614 = 5.3396936033772524e-01F; + float v619 = -8.7484229096165667e-01F; + const int32_t *v958 = &v5[v0]; + float32x2_t *v1193 = &v6[v2]; + int64_t v27 = v0 * 14; + int64_t v37 = v0 * 7; + int64_t v45 = v0 * 21; + int64_t v57 = v0 * 4; + int64_t v65 = v0 * 18; + int64_t v75 = v0 * 11; + int64_t v83 = v0 * 25; + int64_t v95 = v0 * 8; + int64_t v103 = v0 * 22; + int64_t v113 = v0 * 15; + int64_t v133 = v0 * 12; + int64_t v141 = v0 * 26; + int64_t v151 = v0 * 19; + int64_t v159 = v0 * 5; + int64_t v171 = v0 * 16; + int64_t v179 = v0 * 2; + int64_t v189 = v0 * 23; + int64_t v197 = v0 * 9; + int64_t v209 = v0 * 20; + int64_t v217 = v0 * 6; + int64_t v227 = v0 * 27; + int64_t v235 = v0 * 13; + int64_t v247 = v0 * 24; + int64_t v255 = v0 * 10; + int64_t v265 = v0 * 3; + int64_t v273 = v0 * 17; + float v508 = v4 * v604; + float v515 = v4 * v609; + float v522 = v4 * v614; + float v529 = v4 * v619; + float v572 = v4 * v569; + float v579 = v4 * v576; + float v586 = v4 * v583; + float v593 = v4 * v590; + float v600 = v4 * v597; + int64_t 
v652 = v2 * 21; + int64_t v659 = v2 * 14; + int64_t v666 = v2 * 7; + int64_t v675 = v2 * 8; + int64_t v689 = v2 * 22; + int64_t v696 = v2 * 15; + int64_t v705 = v2 * 16; + int64_t v712 = v2 * 9; + int64_t v719 = v2 * 2; + int64_t v726 = v2 * 23; + int64_t v735 = v2 * 24; + int64_t v742 = v2 * 17; + int64_t v749 = v2 * 10; + int64_t v756 = v2 * 3; + int64_t v765 = v2 * 4; + int64_t v772 = v2 * 25; + int64_t v779 = v2 * 18; + int64_t v786 = v2 * 11; + int64_t v795 = v2 * 12; + int64_t v802 = v2 * 5; + int64_t v809 = v2 * 26; + int64_t v816 = v2 * 19; + int64_t v825 = v2 * 20; + int64_t v832 = v2 * 13; + int64_t v839 = v2 * 6; + int64_t v846 = v2 * 27; + const int32_t *v859 = &v5[0]; + svint64_t v1103 = svindex_s64(0, v1); + svfloat32_t v1124 = svdup_n_f32(v485); + svfloat32_t v1125 = svdup_n_f32(v490); + svfloat32_t v1126 = svdup_n_f32(v495); + svfloat32_t v1127 = svdup_n_f32(v500); + svfloat32_t v1137 = svdup_n_f32(v604); + svfloat32_t v1138 = svdup_n_f32(v609); + svfloat32_t v1139 = svdup_n_f32(v614); + svfloat32_t v1140 = svdup_n_f32(v619); + float32x2_t *v1148 = &v6[0]; + svint16_t v861 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v859), v1103)); + const int32_t *v868 = &v5[v27]; + const int32_t *v877 = &v5[v37]; + const int32_t *v886 = &v5[v45]; + const int32_t *v895 = &v5[v57]; + const int32_t *v904 = &v5[v65]; + const int32_t *v913 = &v5[v75]; + const int32_t *v922 = &v5[v83]; + const int32_t *v931 = &v5[v95]; + const int32_t *v940 = &v5[v103]; + const int32_t *v949 = &v5[v113]; + svint16_t v960 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v958), v1103)); + const int32_t *v967 = &v5[v133]; + const int32_t *v976 = &v5[v141]; + const int32_t *v985 = &v5[v151]; + const int32_t *v994 = &v5[v159]; + const int32_t *v1003 = &v5[v171]; + const int32_t *v1012 = &v5[v179]; + const int32_t *v1021 = &v5[v189]; + const int32_t *v1030 = &v5[v197]; + const int32_t *v1039 = &v5[v209]; + const int32_t *v1048 = &v5[v217]; + const int32_t *v1057 = &v5[v227]; + const int32_t *v1066 = &v5[v235]; + const int32_t *v1075 = &v5[v247]; + const int32_t *v1084 = &v5[v255]; + const int32_t *v1093 = &v5[v265]; + const int32_t *v1102 = &v5[v273]; + svfloat32_t v1128 = svdup_n_f32(v508); + svfloat32_t v1129 = svdup_n_f32(v515); + svfloat32_t v1130 = svdup_n_f32(v522); + svfloat32_t v1131 = svdup_n_f32(v529); + svfloat32_t v1132 = svdup_n_f32(v572); + svfloat32_t v1133 = svdup_n_f32(v579); + svfloat32_t v1134 = svdup_n_f32(v586); + svfloat32_t v1135 = svdup_n_f32(v593); + svfloat32_t v1136 = svdup_n_f32(v600); + float32x2_t *v1157 = &v6[v652]; + float32x2_t *v1166 = &v6[v659]; + float32x2_t *v1175 = &v6[v666]; + float32x2_t *v1184 = &v6[v675]; + float32x2_t *v1202 = &v6[v689]; + float32x2_t *v1211 = &v6[v696]; + float32x2_t *v1220 = &v6[v705]; + float32x2_t *v1229 = &v6[v712]; + float32x2_t *v1238 = &v6[v719]; + float32x2_t *v1247 = &v6[v726]; + float32x2_t *v1256 = &v6[v735]; + float32x2_t *v1265 = &v6[v742]; + float32x2_t *v1274 = &v6[v749]; + float32x2_t *v1283 = &v6[v756]; + float32x2_t *v1292 = &v6[v765]; + float32x2_t *v1301 = &v6[v772]; + float32x2_t *v1310 = &v6[v779]; + float32x2_t *v1319 = &v6[v786]; + float32x2_t *v1328 = &v6[v795]; + float32x2_t *v1337 = &v6[v802]; + float32x2_t *v1346 = &v6[v809]; + float32x2_t *v1355 = &v6[v816]; + float32x2_t *v1364 = &v6[v825]; + float32x2_t *v1373 = &v6[v832]; + float32x2_t *v1382 = &v6[v839]; + float32x2_t *v1391 = &v6[v846]; + svfloat32_t v25 = svmul_n_f32_x( + pred_full, + 
svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v861, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v127 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v960, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svint16_t v870 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v868), v1103)); + svint16_t v879 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v877), v1103)); + svint16_t v888 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v886), v1103)); + svint16_t v897 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v895), v1103)); + svint16_t v906 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v904), v1103)); + svint16_t v915 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v913), v1103)); + svint16_t v924 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v922), v1103)); + svint16_t v933 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v931), v1103)); + svint16_t v942 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v940), v1103)); + svint16_t v951 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v949), v1103)); + svint16_t v969 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v967), v1103)); + svint16_t v978 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v976), v1103)); + svint16_t v987 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v985), v1103)); + svint16_t v996 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v994), v1103)); + svint16_t v1005 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1003), v1103)); + svint16_t v1014 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1012), v1103)); + svint16_t v1023 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1021), v1103)); + svint16_t v1032 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1030), v1103)); + svint16_t v1041 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1039), v1103)); + svint16_t v1050 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1048), v1103)); + svint16_t v1059 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1057), v1103)); + svint16_t v1068 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1066), v1103)); + svint16_t v1077 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1075), v1103)); + svint16_t v1086 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1084), v1103)); + svint16_t v1095 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1093), v1103)); + svint16_t v1104 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1102), v1103)); + svfloat32_t v33 = svmul_n_f32_x( + 
pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v870, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v43 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v879, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v51 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v888, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v63 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v897, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v71 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v906, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v81 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v915, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v89 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v924, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v101 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v933, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v109 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v942, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v119 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v951, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v139 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v969, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v147 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v978, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v157 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v987, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v165 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v996, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v177 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1005, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v185 = svmul_n_f32_x( + pred_full, + 
svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1014, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v195 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1023, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v203 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1032, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v215 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1041, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v223 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1050, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v233 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1059, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v241 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1068, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v253 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1077, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v261 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1086, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v271 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1095, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v279 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1104, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v34 = svadd_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v35 = svsub_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v52 = svadd_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v53 = svsub_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v72 = svadd_f32_x(svptrue_b32(), v63, v71); + svfloat32_t v73 = svsub_f32_x(svptrue_b32(), v63, v71); + svfloat32_t v90 = svadd_f32_x(svptrue_b32(), v81, v89); + svfloat32_t v91 = svsub_f32_x(svptrue_b32(), v81, v89); + svfloat32_t v110 = svadd_f32_x(svptrue_b32(), v101, v109); + svfloat32_t v111 = svsub_f32_x(svptrue_b32(), v101, v109); + svfloat32_t v128 = svadd_f32_x(svptrue_b32(), v119, v127); + svfloat32_t v129 = svsub_f32_x(svptrue_b32(), v119, v127); + svfloat32_t v148 = svadd_f32_x(svptrue_b32(), v139, v147); + svfloat32_t v149 = svsub_f32_x(svptrue_b32(), v139, v147); + svfloat32_t v166 = svadd_f32_x(svptrue_b32(), v157, v165); + svfloat32_t v167 = svsub_f32_x(svptrue_b32(), v157, v165); + svfloat32_t v186 = svadd_f32_x(svptrue_b32(), v177, 
v185); + svfloat32_t v187 = svsub_f32_x(svptrue_b32(), v177, v185); + svfloat32_t v204 = svadd_f32_x(svptrue_b32(), v195, v203); + svfloat32_t v205 = svsub_f32_x(svptrue_b32(), v195, v203); + svfloat32_t v224 = svadd_f32_x(svptrue_b32(), v215, v223); + svfloat32_t v225 = svsub_f32_x(svptrue_b32(), v215, v223); + svfloat32_t v242 = svadd_f32_x(svptrue_b32(), v233, v241); + svfloat32_t v243 = svsub_f32_x(svptrue_b32(), v233, v241); + svfloat32_t v262 = svadd_f32_x(svptrue_b32(), v253, v261); + svfloat32_t v263 = svsub_f32_x(svptrue_b32(), v253, v261); + svfloat32_t v280 = svadd_f32_x(svptrue_b32(), v271, v279); + svfloat32_t v281 = svsub_f32_x(svptrue_b32(), v271, v279); + svfloat32_t v54 = svadd_f32_x(svptrue_b32(), v34, v52); + svfloat32_t v55 = svsub_f32_x(svptrue_b32(), v34, v52); + svfloat32_t v92 = svadd_f32_x(svptrue_b32(), v72, v90); + svfloat32_t v93 = svsub_f32_x(svptrue_b32(), v72, v90); + svfloat32_t v130 = svadd_f32_x(svptrue_b32(), v110, v128); + svfloat32_t v131 = svsub_f32_x(svptrue_b32(), v110, v128); + svfloat32_t v168 = svadd_f32_x(svptrue_b32(), v148, v166); + svfloat32_t v169 = svsub_f32_x(svptrue_b32(), v148, v166); + svfloat32_t v206 = svadd_f32_x(svptrue_b32(), v186, v204); + svfloat32_t v207 = svsub_f32_x(svptrue_b32(), v186, v204); + svfloat32_t v244 = svadd_f32_x(svptrue_b32(), v224, v242); + svfloat32_t v245 = svsub_f32_x(svptrue_b32(), v224, v242); + svfloat32_t v282 = svadd_f32_x(svptrue_b32(), v262, v280); + svfloat32_t v283 = svsub_f32_x(svptrue_b32(), v262, v280); + svfloat32_t v462 = svadd_f32_x(svptrue_b32(), v73, v263); + svfloat32_t v463 = svsub_f32_x(svptrue_b32(), v73, v263); + svfloat32_t v464 = svadd_f32_x(svptrue_b32(), v187, v149); + svfloat32_t v465 = svsub_f32_x(svptrue_b32(), v187, v149); + svfloat32_t v466 = svadd_f32_x(svptrue_b32(), v111, v225); + svfloat32_t v467 = svsub_f32_x(svptrue_b32(), v111, v225); + svfloat32_t v551 = svadd_f32_x(svptrue_b32(), v91, v281); + svfloat32_t v552 = svsub_f32_x(svptrue_b32(), v91, v281); + svfloat32_t v553 = svadd_f32_x(svptrue_b32(), v205, v167); + svfloat32_t v554 = svsub_f32_x(svptrue_b32(), v205, v167); + svfloat32_t v555 = svadd_f32_x(svptrue_b32(), v129, v243); + svfloat32_t v556 = svsub_f32_x(svptrue_b32(), v129, v243); + svfloat32_t v284 = svadd_f32_x(svptrue_b32(), v92, v282); + svfloat32_t v285 = svsub_f32_x(svptrue_b32(), v92, v282); + svfloat32_t v286 = svadd_f32_x(svptrue_b32(), v206, v168); + svfloat32_t v287 = svsub_f32_x(svptrue_b32(), v206, v168); + svfloat32_t v288 = svadd_f32_x(svptrue_b32(), v130, v244); + svfloat32_t v289 = svsub_f32_x(svptrue_b32(), v130, v244); + svfloat32_t v373 = svadd_f32_x(svptrue_b32(), v93, v283); + svfloat32_t v374 = svsub_f32_x(svptrue_b32(), v93, v283); + svfloat32_t v375 = svadd_f32_x(svptrue_b32(), v207, v169); + svfloat32_t v376 = svsub_f32_x(svptrue_b32(), v207, v169); + svfloat32_t v377 = svadd_f32_x(svptrue_b32(), v131, v245); + svfloat32_t v378 = svsub_f32_x(svptrue_b32(), v131, v245); + svfloat32_t v468 = svadd_f32_x(svptrue_b32(), v462, v464); + svfloat32_t v471 = svsub_f32_x(svptrue_b32(), v462, v464); + svfloat32_t v472 = svsub_f32_x(svptrue_b32(), v464, v466); + svfloat32_t v473 = svsub_f32_x(svptrue_b32(), v466, v462); + svfloat32_t v474 = svadd_f32_x(svptrue_b32(), v463, v465); + svfloat32_t v476 = svsub_f32_x(svptrue_b32(), v463, v465); + svfloat32_t v477 = svsub_f32_x(svptrue_b32(), v465, v467); + svfloat32_t v478 = svsub_f32_x(svptrue_b32(), v467, v463); + svfloat32_t v557 = svadd_f32_x(svptrue_b32(), v551, v553); + svfloat32_t v560 = 
svsub_f32_x(svptrue_b32(), v551, v553); + svfloat32_t v561 = svsub_f32_x(svptrue_b32(), v553, v555); + svfloat32_t v562 = svsub_f32_x(svptrue_b32(), v555, v551); + svfloat32_t v563 = svadd_f32_x(svptrue_b32(), v552, v554); + svfloat32_t v565 = svsub_f32_x(svptrue_b32(), v552, v554); + svfloat32_t v566 = svsub_f32_x(svptrue_b32(), v554, v556); + svfloat32_t v567 = svsub_f32_x(svptrue_b32(), v556, v552); + svfloat32_t v290 = svadd_f32_x(svptrue_b32(), v284, v286); + svfloat32_t v293 = svsub_f32_x(svptrue_b32(), v284, v286); + svfloat32_t v294 = svsub_f32_x(svptrue_b32(), v286, v288); + svfloat32_t v295 = svsub_f32_x(svptrue_b32(), v288, v284); + svfloat32_t v296 = svadd_f32_x(svptrue_b32(), v285, v287); + svfloat32_t v298 = svsub_f32_x(svptrue_b32(), v285, v287); + svfloat32_t v299 = svsub_f32_x(svptrue_b32(), v287, v289); + svfloat32_t v300 = svsub_f32_x(svptrue_b32(), v289, v285); + svfloat32_t v379 = svadd_f32_x(svptrue_b32(), v373, v375); + svfloat32_t v382 = svsub_f32_x(svptrue_b32(), v373, v375); + svfloat32_t v383 = svsub_f32_x(svptrue_b32(), v375, v377); + svfloat32_t v384 = svsub_f32_x(svptrue_b32(), v377, v373); + svfloat32_t v385 = svadd_f32_x(svptrue_b32(), v374, v376); + svfloat32_t v387 = svsub_f32_x(svptrue_b32(), v374, v376); + svfloat32_t v388 = svsub_f32_x(svptrue_b32(), v376, v378); + svfloat32_t v389 = svsub_f32_x(svptrue_b32(), v378, v374); + svfloat32_t v469 = svadd_f32_x(svptrue_b32(), v468, v466); + svfloat32_t v475 = svadd_f32_x(svptrue_b32(), v474, v467); + svfloat32_t zero517 = svdup_n_f32(0); + svfloat32_t v517 = svcmla_f32_x(pred_full, zero517, v1129, v476, 90); + svfloat32_t zero524 = svdup_n_f32(0); + svfloat32_t v524 = svcmla_f32_x(pred_full, zero524, v1130, v477, 90); + svfloat32_t zero531 = svdup_n_f32(0); + svfloat32_t v531 = svcmla_f32_x(pred_full, zero531, v1131, v478, 90); + svfloat32_t v558 = svadd_f32_x(svptrue_b32(), v557, v555); + svfloat32_t v564 = svadd_f32_x(svptrue_b32(), v563, v556); + svfloat32_t zero588 = svdup_n_f32(0); + svfloat32_t v588 = svcmla_f32_x(pred_full, zero588, v1134, v560, 90); + svfloat32_t zero595 = svdup_n_f32(0); + svfloat32_t v595 = svcmla_f32_x(pred_full, zero595, v1135, v561, 90); + svfloat32_t zero602 = svdup_n_f32(0); + svfloat32_t v602 = svcmla_f32_x(pred_full, zero602, v1136, v562, 90); + svfloat32_t v612 = svmul_f32_x(svptrue_b32(), v565, v1138); + svfloat32_t v617 = svmul_f32_x(svptrue_b32(), v566, v1139); + svfloat32_t v291 = svadd_f32_x(svptrue_b32(), v290, v288); + svfloat32_t v297 = svadd_f32_x(svptrue_b32(), v296, v289); + svfloat32_t zero339 = svdup_n_f32(0); + svfloat32_t v339 = svcmla_f32_x(pred_full, zero339, v1129, v298, 90); + svfloat32_t zero346 = svdup_n_f32(0); + svfloat32_t v346 = svcmla_f32_x(pred_full, zero346, v1130, v299, 90); + svfloat32_t zero353 = svdup_n_f32(0); + svfloat32_t v353 = svcmla_f32_x(pred_full, zero353, v1131, v300, 90); + svfloat32_t v380 = svadd_f32_x(svptrue_b32(), v379, v377); + svfloat32_t v386 = svadd_f32_x(svptrue_b32(), v385, v378); + svfloat32_t zero428 = svdup_n_f32(0); + svfloat32_t v428 = svcmla_f32_x(pred_full, zero428, v1129, v387, 90); + svfloat32_t zero435 = svdup_n_f32(0); + svfloat32_t v435 = svcmla_f32_x(pred_full, zero435, v1130, v388, 90); + svfloat32_t zero442 = svdup_n_f32(0); + svfloat32_t v442 = svcmla_f32_x(pred_full, zero442, v1131, v389, 90); + svfloat32_t v470 = svadd_f32_x(svptrue_b32(), v469, v35); + svfloat32_t zero510 = svdup_n_f32(0); + svfloat32_t v510 = svcmla_f32_x(pred_full, zero510, v1128, v475, 90); + svfloat32_t v559 = 
svadd_f32_x(svptrue_b32(), v558, v53); + svfloat32_t v292 = svadd_f32_x(svptrue_b32(), v291, v54); + svfloat32_t zero332 = svdup_n_f32(0); + svfloat32_t v332 = svcmla_f32_x(pred_full, zero332, v1128, v297, 90); + svfloat32_t v381 = svadd_f32_x(svptrue_b32(), v380, v55); + svfloat32_t zero421 = svdup_n_f32(0); + svfloat32_t v421 = svcmla_f32_x(pred_full, zero421, v1128, v386, 90); + svfloat32_t v532 = svmla_f32_x(pred_full, v470, v469, v1124); + svfloat32_t v539 = svadd_f32_x(svptrue_b32(), v510, v517); + svfloat32_t v541 = svsub_f32_x(svptrue_b32(), v510, v517); + svfloat32_t v543 = svsub_f32_x(svptrue_b32(), v510, v524); + svfloat32_t zero574 = svdup_n_f32(0); + svfloat32_t v574 = svcmla_f32_x(pred_full, zero574, v1132, v559, 90); + svfloat32_t v630 = svmla_f32_x(pred_full, v612, v564, v1137); + svfloat32_t v632 = svnmls_f32_x(pred_full, v612, v564, v1137); + svfloat32_t v634 = svnmls_f32_x(pred_full, v617, v564, v1137); + svfloat32_t v354 = svmla_f32_x(pred_full, v292, v291, v1124); + svfloat32_t v361 = svadd_f32_x(svptrue_b32(), v332, v339); + svfloat32_t v363 = svsub_f32_x(svptrue_b32(), v332, v339); + svfloat32_t v365 = svsub_f32_x(svptrue_b32(), v332, v346); + svfloat32_t v443 = svmla_f32_x(pred_full, v381, v380, v1124); + svfloat32_t v450 = svadd_f32_x(svptrue_b32(), v421, v428); + svfloat32_t v452 = svsub_f32_x(svptrue_b32(), v421, v428); + svfloat32_t v454 = svsub_f32_x(svptrue_b32(), v421, v435); + svfloat32_t v533 = svmla_f32_x(pred_full, v532, v471, v1125); + svfloat32_t v535 = svmls_f32_x(pred_full, v532, v471, v1125); + svfloat32_t v537 = svmls_f32_x(pred_full, v532, v472, v1126); + svfloat32_t v540 = svadd_f32_x(svptrue_b32(), v539, v524); + svfloat32_t v542 = svsub_f32_x(svptrue_b32(), v541, v531); + svfloat32_t v544 = svadd_f32_x(svptrue_b32(), v543, v531); + svfloat32_t v623 = svcmla_f32_x(pred_full, v574, v1133, v558, 90); + svfloat32_t v631 = svmla_f32_x(pred_full, v630, v566, v1139); + svfloat32_t v633 = svmls_f32_x(pred_full, v632, v567, v1140); + svfloat32_t v635 = svmla_f32_x(pred_full, v634, v567, v1140); + svfloat32_t v642 = svadd_f32_x(svptrue_b32(), v470, v574); + svfloat32_t v643 = svsub_f32_x(svptrue_b32(), v470, v574); + svst1_f64(pred_full, (double *)(v1148), svreinterpret_f64_f32(v292)); + svst1_f64(pred_full, (double *)(v1166), svreinterpret_f64_f32(v381)); + svfloat32_t v355 = svmla_f32_x(pred_full, v354, v293, v1125); + svfloat32_t v357 = svmls_f32_x(pred_full, v354, v293, v1125); + svfloat32_t v359 = svmls_f32_x(pred_full, v354, v294, v1126); + svfloat32_t v362 = svadd_f32_x(svptrue_b32(), v361, v346); + svfloat32_t v364 = svsub_f32_x(svptrue_b32(), v363, v353); + svfloat32_t v366 = svadd_f32_x(svptrue_b32(), v365, v353); + svfloat32_t v444 = svmla_f32_x(pred_full, v443, v382, v1125); + svfloat32_t v446 = svmls_f32_x(pred_full, v443, v382, v1125); + svfloat32_t v448 = svmls_f32_x(pred_full, v443, v383, v1126); + svfloat32_t v451 = svadd_f32_x(svptrue_b32(), v450, v435); + svfloat32_t v453 = svsub_f32_x(svptrue_b32(), v452, v442); + svfloat32_t v455 = svadd_f32_x(svptrue_b32(), v454, v442); + svfloat32_t v534 = svmla_f32_x(pred_full, v533, v472, v1126); + svfloat32_t v536 = svmls_f32_x(pred_full, v535, v473, v1127); + svfloat32_t v538 = svmla_f32_x(pred_full, v537, v473, v1127); + svfloat32_t v624 = svadd_f32_x(svptrue_b32(), v623, v588); + svfloat32_t v626 = svsub_f32_x(svptrue_b32(), v623, v588); + svfloat32_t v628 = svsub_f32_x(svptrue_b32(), v623, v595); + svst1_f64(pred_full, (double *)(v1157), svreinterpret_f64_f32(v643)); + svst1_f64(pred_full, 
(double *)(v1175), svreinterpret_f64_f32(v642)); + svfloat32_t v356 = svmla_f32_x(pred_full, v355, v294, v1126); + svfloat32_t v358 = svmls_f32_x(pred_full, v357, v295, v1127); + svfloat32_t v360 = svmla_f32_x(pred_full, v359, v295, v1127); + svfloat32_t v445 = svmla_f32_x(pred_full, v444, v383, v1126); + svfloat32_t v447 = svmls_f32_x(pred_full, v446, v384, v1127); + svfloat32_t v449 = svmla_f32_x(pred_full, v448, v384, v1127); + svfloat32_t v545 = svadd_f32_x(svptrue_b32(), v534, v540); + svfloat32_t v546 = svsub_f32_x(svptrue_b32(), v534, v540); + svfloat32_t v547 = svadd_f32_x(svptrue_b32(), v536, v542); + svfloat32_t v548 = svsub_f32_x(svptrue_b32(), v536, v542); + svfloat32_t v549 = svadd_f32_x(svptrue_b32(), v538, v544); + svfloat32_t v550 = svsub_f32_x(svptrue_b32(), v538, v544); + svfloat32_t v625 = svadd_f32_x(svptrue_b32(), v624, v595); + svfloat32_t v627 = svsub_f32_x(svptrue_b32(), v626, v602); + svfloat32_t v629 = svadd_f32_x(svptrue_b32(), v628, v602); + svfloat32_t v367 = svadd_f32_x(svptrue_b32(), v356, v362); + svfloat32_t v368 = svsub_f32_x(svptrue_b32(), v356, v362); + svfloat32_t v369 = svadd_f32_x(svptrue_b32(), v358, v364); + svfloat32_t v370 = svsub_f32_x(svptrue_b32(), v358, v364); + svfloat32_t v371 = svadd_f32_x(svptrue_b32(), v360, v366); + svfloat32_t v372 = svsub_f32_x(svptrue_b32(), v360, v366); + svfloat32_t v456 = svadd_f32_x(svptrue_b32(), v445, v451); + svfloat32_t v457 = svsub_f32_x(svptrue_b32(), v445, v451); + svfloat32_t v458 = svadd_f32_x(svptrue_b32(), v447, v453); + svfloat32_t v459 = svsub_f32_x(svptrue_b32(), v447, v453); + svfloat32_t v460 = svadd_f32_x(svptrue_b32(), v449, v455); + svfloat32_t v461 = svsub_f32_x(svptrue_b32(), v449, v455); + svfloat32_t v636 = svadd_f32_x(svptrue_b32(), v625, v631); + svfloat32_t v637 = svsub_f32_x(svptrue_b32(), v625, v631); + svfloat32_t v638 = svadd_f32_x(svptrue_b32(), v627, v633); + svfloat32_t v639 = svsub_f32_x(svptrue_b32(), v627, v633); + svfloat32_t v640 = svadd_f32_x(svptrue_b32(), v629, v635); + svfloat32_t v641 = svsub_f32_x(svptrue_b32(), v629, v635); + svfloat32_t v672 = svadd_f32_x(svptrue_b32(), v546, v637); + svfloat32_t v673 = svsub_f32_x(svptrue_b32(), v546, v637); + svfloat32_t v702 = svadd_f32_x(svptrue_b32(), v548, v639); + svfloat32_t v703 = svsub_f32_x(svptrue_b32(), v548, v639); + svfloat32_t v732 = svadd_f32_x(svptrue_b32(), v549, v640); + svfloat32_t v733 = svsub_f32_x(svptrue_b32(), v549, v640); + svfloat32_t v762 = svadd_f32_x(svptrue_b32(), v550, v641); + svfloat32_t v763 = svsub_f32_x(svptrue_b32(), v550, v641); + svfloat32_t v792 = svadd_f32_x(svptrue_b32(), v547, v638); + svfloat32_t v793 = svsub_f32_x(svptrue_b32(), v547, v638); + svfloat32_t v822 = svadd_f32_x(svptrue_b32(), v545, v636); + svfloat32_t v823 = svsub_f32_x(svptrue_b32(), v545, v636); + svst1_f64(pred_full, (double *)(v1184), svreinterpret_f64_f32(v368)); + svst1_f64(pred_full, (double *)(v1202), svreinterpret_f64_f32(v457)); + svst1_f64(pred_full, (double *)(v1220), svreinterpret_f64_f32(v370)); + svst1_f64(pred_full, (double *)(v1238), svreinterpret_f64_f32(v459)); + svst1_f64(pred_full, (double *)(v1256), svreinterpret_f64_f32(v371)); + svst1_f64(pred_full, (double *)(v1274), svreinterpret_f64_f32(v460)); + svst1_f64(pred_full, (double *)(v1292), svreinterpret_f64_f32(v372)); + svst1_f64(pred_full, (double *)(v1310), svreinterpret_f64_f32(v461)); + svst1_f64(pred_full, (double *)(v1328), svreinterpret_f64_f32(v369)); + svst1_f64(pred_full, (double *)(v1346), svreinterpret_f64_f32(v458)); + svst1_f64(pred_full, 
(double *)(v1364), svreinterpret_f64_f32(v367)); + svst1_f64(pred_full, (double *)(v1382), svreinterpret_f64_f32(v456)); + svst1_f64(pred_full, (double *)(v1193), svreinterpret_f64_f32(v673)); + svst1_f64(pred_full, (double *)(v1211), svreinterpret_f64_f32(v672)); + svst1_f64(pred_full, (double *)(v1229), svreinterpret_f64_f32(v703)); + svst1_f64(pred_full, (double *)(v1247), svreinterpret_f64_f32(v702)); + svst1_f64(pred_full, (double *)(v1265), svreinterpret_f64_f32(v733)); + svst1_f64(pred_full, (double *)(v1283), svreinterpret_f64_f32(v732)); + svst1_f64(pred_full, (double *)(v1301), svreinterpret_f64_f32(v763)); + svst1_f64(pred_full, (double *)(v1319), svreinterpret_f64_f32(v762)); + svst1_f64(pred_full, (double *)(v1337), svreinterpret_f64_f32(v793)); + svst1_f64(pred_full, (double *)(v1355), svreinterpret_f64_f32(v792)); + svst1_f64(pred_full, (double *)(v1373), svreinterpret_f64_f32(v823)); + svst1_f64(pred_full, (double *)(v1391), svreinterpret_f64_f32(v822)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cf32_ac_n_gu30(const armral_cmplx_int16_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + int idist, float dir) { + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + for (int j = 0; j < howmany; j += 1) { + int16x4_t v110 = vld1s_s16(&v5[istride]); + float v434 = -1.2500000000000000e+00F; + float v438 = 5.5901699437494745e-01F; + float v441 = 1.5388417685876268e+00F; + float v442 = -1.5388417685876268e+00F; + float v448 = 5.8778525229247325e-01F; + float v449 = -5.8778525229247325e-01F; + float v455 = 3.6327126400268028e-01F; + float v456 = -3.6327126400268028e-01F; + float v480 = -1.4999999999999998e+00F; + float v484 = 1.8749999999999998e+00F; + float v488 = -8.3852549156242107e-01F; + float v491 = -2.3082626528814396e+00F; + float v492 = 2.3082626528814396e+00F; + float v498 = -8.8167787843870971e-01F; + float v499 = 8.8167787843870971e-01F; + float v505 = -5.4490689600402031e-01F; + float v506 = 5.4490689600402031e-01F; + float v529 = 8.6602540378443871e-01F; + float v530 = -8.6602540378443871e-01F; + float v536 = -1.0825317547305484e+00F; + float v537 = 1.0825317547305484e+00F; + float v543 = 4.8412291827592718e-01F; + float v544 = -4.8412291827592718e-01F; + float32x2_t v546 = (float32x2_t){v4, v4}; + float v551 = -1.3326760640014592e+00F; + float v555 = -5.0903696045512736e-01F; + float v559 = -3.1460214309120460e-01F; + int16x4_t v20 = vld1s_s16(&v5[0]); + float32x2_t v111 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v110)), 15); + float32x2_t v435 = (float32x2_t){v434, v434}; + float32x2_t v439 = (float32x2_t){v438, v438}; + float32x2_t v443 = (float32x2_t){v441, v442}; + float32x2_t v450 = (float32x2_t){v448, v449}; + float32x2_t v457 = (float32x2_t){v455, v456}; + float32x2_t v481 = (float32x2_t){v480, v480}; + float32x2_t v485 = (float32x2_t){v484, v484}; + float32x2_t v489 = (float32x2_t){v488, v488}; + float32x2_t v493 = (float32x2_t){v491, v492}; + float32x2_t v500 = (float32x2_t){v498, v499}; + float32x2_t v507 = (float32x2_t){v505, v506}; + float32x2_t v531 = (float32x2_t){v529, v530}; + float32x2_t v538 = (float32x2_t){v536, v537}; + float32x2_t v545 = (float32x2_t){v543, v544}; + float32x2_t v552 = (float32x2_t){v551, v551}; + float32x2_t v556 = (float32x2_t){v555, v555}; + float32x2_t v560 = (float32x2_t){v559, v559}; + float32x2_t v21 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v20)), 15); + int16x4_t v26 = vld1s_s16(&v5[istride * 
15]); + int16x4_t v34 = vld1s_s16(&v5[istride * 6]); + int16x4_t v40 = vld1s_s16(&v5[istride * 21]); + int16x4_t v48 = vld1s_s16(&v5[istride * 12]); + int16x4_t v54 = vld1s_s16(&v5[istride * 27]); + int16x4_t v62 = vld1s_s16(&v5[istride * 18]); + int16x4_t v68 = vld1s_s16(&v5[istride * 3]); + int16x4_t v76 = vld1s_s16(&v5[istride * 24]); + int16x4_t v82 = vld1s_s16(&v5[istride * 9]); + int16x4_t v90 = vld1s_s16(&v5[istride * 10]); + int16x4_t v96 = vld1s_s16(&v5[istride * 25]); + int16x4_t v104 = vld1s_s16(&v5[istride * 16]); + int16x4_t v118 = vld1s_s16(&v5[istride * 22]); + int16x4_t v124 = vld1s_s16(&v5[istride * 7]); + int16x4_t v132 = vld1s_s16(&v5[istride * 28]); + int16x4_t v138 = vld1s_s16(&v5[istride * 13]); + int16x4_t v146 = vld1s_s16(&v5[istride * 4]); + int16x4_t v152 = vld1s_s16(&v5[istride * 19]); + int16x4_t v160 = vld1s_s16(&v5[istride * 20]); + int16x4_t v166 = vld1s_s16(&v5[istride * 5]); + int16x4_t v174 = vld1s_s16(&v5[istride * 26]); + int16x4_t v180 = vld1s_s16(&v5[istride * 11]); + int16x4_t v188 = vld1s_s16(&v5[istride * 2]); + int16x4_t v194 = vld1s_s16(&v5[istride * 17]); + int16x4_t v202 = vld1s_s16(&v5[istride * 8]); + int16x4_t v208 = vld1s_s16(&v5[istride * 23]); + int16x4_t v216 = vld1s_s16(&v5[istride * 14]); + int16x4_t v222 = vld1s_s16(&v5[istride * 29]); + float32x2_t v445 = vmul_f32(v546, v443); + float32x2_t v452 = vmul_f32(v546, v450); + float32x2_t v459 = vmul_f32(v546, v457); + float32x2_t v495 = vmul_f32(v546, v493); + float32x2_t v502 = vmul_f32(v546, v500); + float32x2_t v509 = vmul_f32(v546, v507); + float32x2_t v533 = vmul_f32(v546, v531); + float32x2_t v540 = vmul_f32(v546, v538); + float32x2_t v547 = vmul_f32(v546, v545); + float32x2_t v27 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v26)), 15); + float32x2_t v35 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v34)), 15); + float32x2_t v41 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v40)), 15); + float32x2_t v49 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v48)), 15); + float32x2_t v55 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v54)), 15); + float32x2_t v63 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v62)), 15); + float32x2_t v69 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v68)), 15); + float32x2_t v77 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v76)), 15); + float32x2_t v83 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v82)), 15); + float32x2_t v91 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v90)), 15); + float32x2_t v97 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v96)), 15); + float32x2_t v105 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v104)), 15); + float32x2_t v119 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v118)), 15); + float32x2_t v125 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v124)), 15); + float32x2_t v133 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v132)), 15); + float32x2_t v139 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v138)), 15); + float32x2_t v147 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v146)), 15); + float32x2_t v153 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v152)), 15); + float32x2_t v161 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v160)), 15); + float32x2_t v167 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v166)), 15); + float32x2_t v175 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v174)), 15); + float32x2_t v181 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v180)), 15); + float32x2_t v189 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v188)), 15); + float32x2_t v195 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v194)), 15); + float32x2_t v203 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v202)), 15); + float32x2_t v209 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v208)), 15); + 
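+    // The coefficients set up at the top of this loop body appear to be the
+    // Winograd-style 5-point and 3-point DFT constants, consistent with this
+    // kernel computing its length-30 transform via 2 x 3 x 5 sub-transforms.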
float32x2_t v217 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v216)), 15); + float32x2_t v223 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v222)), 15); + float32x2_t v28 = vadd_f32(v21, v27); + float32x2_t v29 = vsub_f32(v21, v27); + float32x2_t v42 = vadd_f32(v35, v41); + float32x2_t v43 = vsub_f32(v35, v41); + float32x2_t v56 = vadd_f32(v49, v55); + float32x2_t v57 = vsub_f32(v49, v55); + float32x2_t v70 = vadd_f32(v63, v69); + float32x2_t v71 = vsub_f32(v63, v69); + float32x2_t v84 = vadd_f32(v77, v83); + float32x2_t v85 = vsub_f32(v77, v83); + float32x2_t v98 = vadd_f32(v91, v97); + float32x2_t v99 = vsub_f32(v91, v97); + float32x2_t v112 = vadd_f32(v105, v111); + float32x2_t v113 = vsub_f32(v105, v111); + float32x2_t v126 = vadd_f32(v119, v125); + float32x2_t v127 = vsub_f32(v119, v125); + float32x2_t v140 = vadd_f32(v133, v139); + float32x2_t v141 = vsub_f32(v133, v139); + float32x2_t v154 = vadd_f32(v147, v153); + float32x2_t v155 = vsub_f32(v147, v153); + float32x2_t v168 = vadd_f32(v161, v167); + float32x2_t v169 = vsub_f32(v161, v167); + float32x2_t v182 = vadd_f32(v175, v181); + float32x2_t v183 = vsub_f32(v175, v181); + float32x2_t v196 = vadd_f32(v189, v195); + float32x2_t v197 = vsub_f32(v189, v195); + float32x2_t v210 = vadd_f32(v203, v209); + float32x2_t v211 = vsub_f32(v203, v209); + float32x2_t v224 = vadd_f32(v217, v223); + float32x2_t v225 = vsub_f32(v217, v223); + float32x2_t v226 = vadd_f32(v98, v168); + float32x2_t v227 = vsub_f32(v98, v168); + float32x2_t v229 = vadd_f32(v112, v182); + float32x2_t v230 = vsub_f32(v112, v182); + float32x2_t v232 = vadd_f32(v126, v196); + float32x2_t v233 = vsub_f32(v126, v196); + float32x2_t v235 = vadd_f32(v140, v210); + float32x2_t v236 = vsub_f32(v140, v210); + float32x2_t v238 = vadd_f32(v154, v224); + float32x2_t v239 = vsub_f32(v154, v224); + float32x2_t v406 = vadd_f32(v99, v169); + float32x2_t v407 = vsub_f32(v99, v169); + float32x2_t v409 = vadd_f32(v113, v183); + float32x2_t v410 = vsub_f32(v113, v183); + float32x2_t v412 = vadd_f32(v127, v197); + float32x2_t v413 = vsub_f32(v127, v197); + float32x2_t v415 = vadd_f32(v141, v211); + float32x2_t v416 = vsub_f32(v141, v211); + float32x2_t v418 = vadd_f32(v155, v225); + float32x2_t v419 = vsub_f32(v155, v225); + float32x2_t v228 = vadd_f32(v226, v28); + float32x2_t v231 = vadd_f32(v229, v42); + float32x2_t v234 = vadd_f32(v232, v56); + float32x2_t v237 = vadd_f32(v235, v70); + float32x2_t v240 = vadd_f32(v238, v84); + float32x2_t v291 = vadd_f32(v229, v238); + float32x2_t v292 = vsub_f32(v229, v238); + float32x2_t v293 = vadd_f32(v235, v232); + float32x2_t v294 = vsub_f32(v235, v232); + float32x2_t v341 = vadd_f32(v230, v239); + float32x2_t v342 = vsub_f32(v230, v239); + float32x2_t v343 = vadd_f32(v236, v233); + float32x2_t v344 = vsub_f32(v236, v233); + float32x2_t v408 = vadd_f32(v406, v29); + float32x2_t v411 = vadd_f32(v409, v43); + float32x2_t v414 = vadd_f32(v412, v57); + float32x2_t v417 = vadd_f32(v415, v71); + float32x2_t v420 = vadd_f32(v418, v85); + float32x2_t v471 = vadd_f32(v409, v418); + float32x2_t v472 = vsub_f32(v409, v418); + float32x2_t v473 = vadd_f32(v415, v412); + float32x2_t v474 = vsub_f32(v415, v412); + float32x2_t v521 = vadd_f32(v410, v419); + float32x2_t v522 = vsub_f32(v410, v419); + float32x2_t v523 = vadd_f32(v416, v413); + float32x2_t v524 = vsub_f32(v416, v413); + float32x2_t v241 = vadd_f32(v231, v240); + float32x2_t v242 = vsub_f32(v231, v240); + float32x2_t v243 = vadd_f32(v237, v234); + float32x2_t v244 = vsub_f32(v237, v234); + float32x2_t v295 = 
vadd_f32(v291, v293); + float32x2_t v296 = vsub_f32(v291, v293); + float32x2_t v297 = vadd_f32(v292, v294); + float32x2_t v316 = vrev64_f32(v292); + float32x2_t v330 = vrev64_f32(v294); + float32x2_t v345 = vadd_f32(v341, v343); + float32x2_t v346 = vsub_f32(v341, v343); + float32x2_t v347 = vadd_f32(v342, v344); + float32x2_t v373 = vmul_f32(v342, v552); + float32x2_t v381 = vmul_f32(v344, v560); + float32x2_t v421 = vadd_f32(v411, v420); + float32x2_t v422 = vsub_f32(v411, v420); + float32x2_t v423 = vadd_f32(v417, v414); + float32x2_t v424 = vsub_f32(v417, v414); + float32x2_t v475 = vadd_f32(v471, v473); + float32x2_t v476 = vsub_f32(v471, v473); + float32x2_t v477 = vadd_f32(v472, v474); + float32x2_t v496 = vrev64_f32(v472); + float32x2_t v510 = vrev64_f32(v474); + float32x2_t v525 = vadd_f32(v521, v523); + float32x2_t v526 = vsub_f32(v521, v523); + float32x2_t v527 = vadd_f32(v522, v524); + float32x2_t v553 = vmul_f32(v522, v552); + float32x2_t v561 = vmul_f32(v524, v560); + float32x2_t v245 = vadd_f32(v241, v243); + float32x2_t v246 = vsub_f32(v241, v243); + float32x2_t v247 = vadd_f32(v242, v244); + float32x2_t v266 = vrev64_f32(v242); + float32x2_t v280 = vrev64_f32(v244); + float32x2_t v298 = vadd_f32(v295, v226); + float32x2_t v306 = vmul_f32(v295, v485); + float32x2_t v310 = vmul_f32(v296, v489); + float32x2_t v317 = vmul_f32(v316, v495); + float32x2_t v323 = vrev64_f32(v297); + float32x2_t v331 = vmul_f32(v330, v509); + float32x2_t v348 = vadd_f32(v345, v227); + float32x2_t v361 = vrev64_f32(v345); + float32x2_t v368 = vrev64_f32(v346); + float32x2_t v377 = vmul_f32(v347, v556); + float32x2_t v425 = vadd_f32(v421, v423); + float32x2_t v426 = vsub_f32(v421, v423); + float32x2_t v427 = vadd_f32(v422, v424); + float32x2_t v446 = vrev64_f32(v422); + float32x2_t v460 = vrev64_f32(v424); + float32x2_t v478 = vadd_f32(v475, v406); + float32x2_t v486 = vmul_f32(v475, v485); + float32x2_t v490 = vmul_f32(v476, v489); + float32x2_t v497 = vmul_f32(v496, v495); + float32x2_t v503 = vrev64_f32(v477); + float32x2_t v511 = vmul_f32(v510, v509); + float32x2_t v528 = vadd_f32(v525, v407); + float32x2_t v541 = vrev64_f32(v525); + float32x2_t v548 = vrev64_f32(v526); + float32x2_t v557 = vmul_f32(v527, v556); + float32x2_t v248 = vadd_f32(v245, v228); + float32x2_t v256 = vmul_f32(v245, v435); + float32x2_t v260 = vmul_f32(v246, v439); + float32x2_t v267 = vmul_f32(v266, v445); + float32x2_t v273 = vrev64_f32(v247); + float32x2_t v281 = vmul_f32(v280, v459); + float32x2_t v302 = vmul_f32(v298, v481); + float32x2_t v324 = vmul_f32(v323, v502); + float32x2_t v354 = vrev64_f32(v348); + float32x2_t v362 = vmul_f32(v361, v540); + float32x2_t v369 = vmul_f32(v368, v547); + float32x2_t v385 = vsub_f32(v373, v377); + float32x2_t v386 = vadd_f32(v377, v381); + float32x2_t v428 = vadd_f32(v425, v408); + float32x2_t v436 = vmul_f32(v425, v435); + float32x2_t v440 = vmul_f32(v426, v439); + float32x2_t v447 = vmul_f32(v446, v445); + float32x2_t v453 = vrev64_f32(v427); + float32x2_t v461 = vmul_f32(v460, v459); + float32x2_t v482 = vmul_f32(v478, v481); + float32x2_t v504 = vmul_f32(v503, v502); + float32x2_t v534 = vrev64_f32(v528); + float32x2_t v542 = vmul_f32(v541, v540); + float32x2_t v549 = vmul_f32(v548, v547); + float32x2_t v565 = vsub_f32(v553, v557); + float32x2_t v566 = vadd_f32(v557, v561); + float32x2_t v274 = vmul_f32(v273, v452); + float32x2_t v282 = vadd_f32(v248, v256); + float32x2_t v332 = vadd_f32(v302, v306); + float32x2_t v335 = vsub_f32(v317, v324); + float32x2_t v336 = vadd_f32(v324, 
v331); + float32x2_t v355 = vmul_f32(v354, v533); + float32x2_t v391 = vadd_f32(v248, v302); + float32x2_t v454 = vmul_f32(v453, v452); + float32x2_t v462 = vadd_f32(v428, v436); + float32x2_t v512 = vadd_f32(v482, v486); + float32x2_t v515 = vsub_f32(v497, v504); + float32x2_t v516 = vadd_f32(v504, v511); + float32x2_t v535 = vmul_f32(v534, v533); + float32x2_t v571 = vadd_f32(v428, v482); + v6[0] = v248; + v6[ostride * 15] = v428; + float32x2_t v283 = vadd_f32(v282, v260); + float32x2_t v284 = vsub_f32(v282, v260); + float32x2_t v285 = vsub_f32(v267, v274); + float32x2_t v286 = vadd_f32(v274, v281); + float32x2_t v333 = vadd_f32(v332, v310); + float32x2_t v334 = vsub_f32(v332, v310); + float32x2_t v382 = vadd_f32(v355, v362); + float32x2_t v392 = vadd_f32(v391, v355); + float32x2_t v393 = vsub_f32(v391, v355); + float32x2_t v463 = vadd_f32(v462, v440); + float32x2_t v464 = vsub_f32(v462, v440); + float32x2_t v465 = vsub_f32(v447, v454); + float32x2_t v466 = vadd_f32(v454, v461); + float32x2_t v513 = vadd_f32(v512, v490); + float32x2_t v514 = vsub_f32(v512, v490); + float32x2_t v562 = vadd_f32(v535, v542); + float32x2_t v572 = vadd_f32(v571, v535); + float32x2_t v573 = vsub_f32(v571, v535); + float32x2_t v287 = vadd_f32(v283, v285); + float32x2_t v288 = vsub_f32(v283, v285); + float32x2_t v289 = vadd_f32(v284, v286); + float32x2_t v290 = vsub_f32(v284, v286); + float32x2_t v337 = vadd_f32(v333, v335); + float32x2_t v338 = vsub_f32(v333, v335); + float32x2_t v339 = vadd_f32(v334, v336); + float32x2_t v340 = vsub_f32(v334, v336); + float32x2_t v383 = vadd_f32(v382, v369); + float32x2_t v384 = vsub_f32(v382, v369); + float32x2_t v467 = vadd_f32(v463, v465); + float32x2_t v468 = vsub_f32(v463, v465); + float32x2_t v469 = vadd_f32(v464, v466); + float32x2_t v470 = vsub_f32(v464, v466); + float32x2_t v517 = vadd_f32(v513, v515); + float32x2_t v518 = vsub_f32(v513, v515); + float32x2_t v519 = vadd_f32(v514, v516); + float32x2_t v520 = vsub_f32(v514, v516); + float32x2_t v563 = vadd_f32(v562, v549); + float32x2_t v564 = vsub_f32(v562, v549); + v6[ostride * 10] = v393; + v6[ostride * 25] = v573; + v6[ostride * 20] = v392; + v6[ostride * 5] = v572; + float32x2_t v387 = vadd_f32(v383, v385); + float32x2_t v388 = vsub_f32(v383, v385); + float32x2_t v389 = vadd_f32(v384, v386); + float32x2_t v390 = vsub_f32(v384, v386); + float32x2_t v394 = vadd_f32(v288, v338); + float32x2_t v397 = vadd_f32(v290, v340); + float32x2_t v400 = vadd_f32(v289, v339); + float32x2_t v403 = vadd_f32(v287, v337); + float32x2_t v567 = vadd_f32(v563, v565); + float32x2_t v568 = vsub_f32(v563, v565); + float32x2_t v569 = vadd_f32(v564, v566); + float32x2_t v570 = vsub_f32(v564, v566); + float32x2_t v574 = vadd_f32(v468, v518); + float32x2_t v577 = vadd_f32(v470, v520); + float32x2_t v580 = vadd_f32(v469, v519); + float32x2_t v583 = vadd_f32(v467, v517); + v6[ostride * 6] = v288; + v6[ostride * 21] = v468; + v6[ostride * 12] = v290; + v6[ostride * 27] = v470; + v6[ostride * 18] = v289; + v6[ostride * 3] = v469; + v6[ostride * 24] = v287; + v6[ostride * 9] = v467; + float32x2_t v395 = vadd_f32(v394, v388); + float32x2_t v396 = vsub_f32(v394, v388); + float32x2_t v398 = vadd_f32(v397, v390); + float32x2_t v399 = vsub_f32(v397, v390); + float32x2_t v401 = vadd_f32(v400, v389); + float32x2_t v402 = vsub_f32(v400, v389); + float32x2_t v404 = vadd_f32(v403, v387); + float32x2_t v405 = vsub_f32(v403, v387); + float32x2_t v575 = vadd_f32(v574, v568); + float32x2_t v576 = vsub_f32(v574, v568); + float32x2_t v578 = vadd_f32(v577, v570); + 
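+    // The remaining add/subtract pairs recombine the partial results; the
+    // stores to v6 write the 30 complex float32 outputs of this transform at
+    // ostride multiples 0 through 29.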
float32x2_t v579 = vsub_f32(v577, v570); + float32x2_t v581 = vadd_f32(v580, v569); + float32x2_t v582 = vsub_f32(v580, v569); + float32x2_t v584 = vadd_f32(v583, v567); + float32x2_t v585 = vsub_f32(v583, v567); + v6[ostride * 16] = v396; + v6[ostride] = v576; + v6[ostride * 22] = v399; + v6[ostride * 7] = v579; + v6[ostride * 28] = v402; + v6[ostride * 13] = v582; + v6[ostride * 4] = v405; + v6[ostride * 19] = v585; + v6[ostride * 26] = v395; + v6[ostride * 11] = v575; + v6[ostride * 2] = v398; + v6[ostride * 17] = v578; + v6[ostride * 8] = v401; + v6[ostride * 23] = v581; + v6[ostride * 14] = v404; + v6[ostride * 29] = v584; + v5 += 1 * idist; + v6 += 1 * 1; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cf32_ac_n_gu30(const armral_cmplx_int16_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + int idist, float dir) { + int64_t v0 = istride; + int64_t v1 = idist; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * v1; + int64_t v12 = v10 * 1; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + float v506 = -1.2500000000000000e+00F; + float v511 = 5.5901699437494745e-01F; + float v516 = -1.5388417685876268e+00F; + float v523 = -5.8778525229247325e-01F; + float v530 = -3.6327126400268028e-01F; + float v554 = -1.4999999999999998e+00F; + float v559 = 1.8749999999999998e+00F; + float v564 = -8.3852549156242107e-01F; + float v569 = 2.3082626528814396e+00F; + float v576 = 8.8167787843870971e-01F; + float v583 = 5.4490689600402031e-01F; + float v607 = -8.6602540378443871e-01F; + float v614 = 1.0825317547305484e+00F; + float v621 = -4.8412291827592718e-01F; + float v628 = -1.3326760640014592e+00F; + float v633 = -5.0903696045512736e-01F; + float v638 = -3.1460214309120460e-01F; + const int32_t *v1000 = &v5[v0]; + float32x2_t *v1307 = &v6[v2]; + int64_t v27 = v0 * 15; + int64_t v37 = v0 * 6; + int64_t v45 = v0 * 21; + int64_t v55 = v0 * 12; + int64_t v63 = v0 * 27; + int64_t v73 = v0 * 18; + int64_t v81 = v0 * 3; + int64_t v91 = v0 * 24; + int64_t v99 = v0 * 9; + int64_t v109 = v0 * 10; + int64_t v117 = v0 * 25; + int64_t v127 = v0 * 16; + int64_t v145 = v0 * 22; + int64_t v153 = v0 * 7; + int64_t v163 = v0 * 28; + int64_t v171 = v0 * 13; + int64_t v181 = v0 * 4; + int64_t v189 = v0 * 19; + int64_t v199 = v0 * 20; + int64_t v207 = v0 * 5; + int64_t v217 = v0 * 26; + int64_t v225 = v0 * 11; + int64_t v235 = v0 * 2; + int64_t v243 = v0 * 17; + int64_t v253 = v0 * 8; + int64_t v261 = v0 * 23; + int64_t v271 = v0 * 14; + int64_t v279 = v0 * 29; + float v519 = v4 * v516; + float v526 = v4 * v523; + float v533 = v4 * v530; + float v572 = v4 * v569; + float v579 = v4 * v576; + float v586 = v4 * v583; + float v610 = v4 * v607; + float v617 = v4 * v614; + float v624 = v4 * v621; + int64_t v674 = v2 * 15; + int64_t v681 = v2 * 6; + int64_t v688 = v2 * 21; + int64_t v695 = v2 * 12; + int64_t v702 = v2 * 27; + int64_t v709 = v2 * 18; + int64_t v716 = v2 * 3; + int64_t v723 = v2 * 24; + int64_t v730 = v2 * 9; + int64_t v737 = v2 * 10; + int64_t v744 = v2 * 25; + int64_t v751 = v2 * 16; + int64_t v765 = v2 * 22; + int64_t v772 = v2 * 7; + int64_t v779 = v2 * 28; + int64_t v786 = v2 * 13; + int64_t v793 = v2 * 4; + int64_t v800 = v2 * 19; + int64_t v807 = v2 * 20; + int64_t v814 = v2 * 5; + int64_t v821 = v2 * 26; + int64_t v828 = v2 * 11; + int64_t v835 = v2 * 2; + 
int64_t v842 = v2 * 17; + int64_t v849 = v2 * 8; + int64_t v856 = v2 * 23; + int64_t v863 = v2 * 14; + int64_t v870 = v2 * 29; + const int32_t *v883 = &v5[0]; + svint64_t v1145 = svindex_s64(0, v1); + svfloat32_t v1166 = svdup_n_f32(v506); + svfloat32_t v1167 = svdup_n_f32(v511); + svfloat32_t v1171 = svdup_n_f32(v554); + svfloat32_t v1172 = svdup_n_f32(v559); + svfloat32_t v1173 = svdup_n_f32(v564); + svfloat32_t v1180 = svdup_n_f32(v628); + svfloat32_t v1181 = svdup_n_f32(v633); + svfloat32_t v1182 = svdup_n_f32(v638); + float32x2_t *v1190 = &v6[0]; + svint16_t v885 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v883), v1145)); + const int32_t *v892 = &v5[v27]; + const int32_t *v901 = &v5[v37]; + const int32_t *v910 = &v5[v45]; + const int32_t *v919 = &v5[v55]; + const int32_t *v928 = &v5[v63]; + const int32_t *v937 = &v5[v73]; + const int32_t *v946 = &v5[v81]; + const int32_t *v955 = &v5[v91]; + const int32_t *v964 = &v5[v99]; + const int32_t *v973 = &v5[v109]; + const int32_t *v982 = &v5[v117]; + const int32_t *v991 = &v5[v127]; + svint16_t v1002 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1000), v1145)); + const int32_t *v1009 = &v5[v145]; + const int32_t *v1018 = &v5[v153]; + const int32_t *v1027 = &v5[v163]; + const int32_t *v1036 = &v5[v171]; + const int32_t *v1045 = &v5[v181]; + const int32_t *v1054 = &v5[v189]; + const int32_t *v1063 = &v5[v199]; + const int32_t *v1072 = &v5[v207]; + const int32_t *v1081 = &v5[v217]; + const int32_t *v1090 = &v5[v225]; + const int32_t *v1099 = &v5[v235]; + const int32_t *v1108 = &v5[v243]; + const int32_t *v1117 = &v5[v253]; + const int32_t *v1126 = &v5[v261]; + const int32_t *v1135 = &v5[v271]; + const int32_t *v1144 = &v5[v279]; + svfloat32_t v1168 = svdup_n_f32(v519); + svfloat32_t v1169 = svdup_n_f32(v526); + svfloat32_t v1170 = svdup_n_f32(v533); + svfloat32_t v1174 = svdup_n_f32(v572); + svfloat32_t v1175 = svdup_n_f32(v579); + svfloat32_t v1176 = svdup_n_f32(v586); + svfloat32_t v1177 = svdup_n_f32(v610); + svfloat32_t v1178 = svdup_n_f32(v617); + svfloat32_t v1179 = svdup_n_f32(v624); + float32x2_t *v1199 = &v6[v674]; + float32x2_t *v1208 = &v6[v681]; + float32x2_t *v1217 = &v6[v688]; + float32x2_t *v1226 = &v6[v695]; + float32x2_t *v1235 = &v6[v702]; + float32x2_t *v1244 = &v6[v709]; + float32x2_t *v1253 = &v6[v716]; + float32x2_t *v1262 = &v6[v723]; + float32x2_t *v1271 = &v6[v730]; + float32x2_t *v1280 = &v6[v737]; + float32x2_t *v1289 = &v6[v744]; + float32x2_t *v1298 = &v6[v751]; + float32x2_t *v1316 = &v6[v765]; + float32x2_t *v1325 = &v6[v772]; + float32x2_t *v1334 = &v6[v779]; + float32x2_t *v1343 = &v6[v786]; + float32x2_t *v1352 = &v6[v793]; + float32x2_t *v1361 = &v6[v800]; + float32x2_t *v1370 = &v6[v807]; + float32x2_t *v1379 = &v6[v814]; + float32x2_t *v1388 = &v6[v821]; + float32x2_t *v1397 = &v6[v828]; + float32x2_t *v1406 = &v6[v835]; + float32x2_t *v1415 = &v6[v842]; + float32x2_t *v1424 = &v6[v849]; + float32x2_t *v1433 = &v6[v856]; + float32x2_t *v1442 = &v6[v863]; + float32x2_t *v1451 = &v6[v870]; + svfloat32_t v25 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v885, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v141 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1002, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F 
/ (1ULL << 31ULL)); + svint16_t v894 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v892), v1145)); + svint16_t v903 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v901), v1145)); + svint16_t v912 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v910), v1145)); + svint16_t v921 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v919), v1145)); + svint16_t v930 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v928), v1145)); + svint16_t v939 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v937), v1145)); + svint16_t v948 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v946), v1145)); + svint16_t v957 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v955), v1145)); + svint16_t v966 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v964), v1145)); + svint16_t v975 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v973), v1145)); + svint16_t v984 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v982), v1145)); + svint16_t v993 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v991), v1145)); + svint16_t v1011 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1009), v1145)); + svint16_t v1020 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1018), v1145)); + svint16_t v1029 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1027), v1145)); + svint16_t v1038 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1036), v1145)); + svint16_t v1047 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1045), v1145)); + svint16_t v1056 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1054), v1145)); + svint16_t v1065 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1063), v1145)); + svint16_t v1074 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1072), v1145)); + svint16_t v1083 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1081), v1145)); + svint16_t v1092 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1090), v1145)); + svint16_t v1101 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1099), v1145)); + svint16_t v1110 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1108), v1145)); + svint16_t v1119 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1117), v1145)); + svint16_t v1128 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1126), v1145)); + svint16_t v1137 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1135), v1145)); + svint16_t v1146 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1144), v1145)); + svfloat32_t v33 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v894, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 
0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v43 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v903, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v51 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v912, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v61 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v921, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v69 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v930, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v79 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v939, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v87 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v948, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v97 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v957, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v105 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v966, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v115 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v975, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v123 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v984, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v133 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v993, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v151 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1011, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v159 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1020, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v169 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1029, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v177 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1038, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 
1.F / (1ULL << 31ULL)); + svfloat32_t v187 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1047, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v195 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1056, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v205 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1065, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v213 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1074, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v223 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1083, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v231 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1092, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v241 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1101, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v249 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1110, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v259 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1119, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v267 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1128, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v277 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1137, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v285 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1146, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v34 = svadd_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v35 = svsub_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v52 = svadd_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v53 = svsub_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v70 = svadd_f32_x(svptrue_b32(), v61, v69); + svfloat32_t v71 = svsub_f32_x(svptrue_b32(), v61, v69); + svfloat32_t v88 = svadd_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v89 = svsub_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v106 = svadd_f32_x(svptrue_b32(), v97, v105); + svfloat32_t v107 = svsub_f32_x(svptrue_b32(), v97, v105); + svfloat32_t v124 = svadd_f32_x(svptrue_b32(), v115, v123); + svfloat32_t v125 = svsub_f32_x(svptrue_b32(), v115, 
v123); + svfloat32_t v142 = svadd_f32_x(svptrue_b32(), v133, v141); + svfloat32_t v143 = svsub_f32_x(svptrue_b32(), v133, v141); + svfloat32_t v160 = svadd_f32_x(svptrue_b32(), v151, v159); + svfloat32_t v161 = svsub_f32_x(svptrue_b32(), v151, v159); + svfloat32_t v178 = svadd_f32_x(svptrue_b32(), v169, v177); + svfloat32_t v179 = svsub_f32_x(svptrue_b32(), v169, v177); + svfloat32_t v196 = svadd_f32_x(svptrue_b32(), v187, v195); + svfloat32_t v197 = svsub_f32_x(svptrue_b32(), v187, v195); + svfloat32_t v214 = svadd_f32_x(svptrue_b32(), v205, v213); + svfloat32_t v215 = svsub_f32_x(svptrue_b32(), v205, v213); + svfloat32_t v232 = svadd_f32_x(svptrue_b32(), v223, v231); + svfloat32_t v233 = svsub_f32_x(svptrue_b32(), v223, v231); + svfloat32_t v250 = svadd_f32_x(svptrue_b32(), v241, v249); + svfloat32_t v251 = svsub_f32_x(svptrue_b32(), v241, v249); + svfloat32_t v268 = svadd_f32_x(svptrue_b32(), v259, v267); + svfloat32_t v269 = svsub_f32_x(svptrue_b32(), v259, v267); + svfloat32_t v286 = svadd_f32_x(svptrue_b32(), v277, v285); + svfloat32_t v287 = svsub_f32_x(svptrue_b32(), v277, v285); + svfloat32_t v288 = svadd_f32_x(svptrue_b32(), v124, v214); + svfloat32_t v289 = svsub_f32_x(svptrue_b32(), v124, v214); + svfloat32_t v291 = svadd_f32_x(svptrue_b32(), v142, v232); + svfloat32_t v292 = svsub_f32_x(svptrue_b32(), v142, v232); + svfloat32_t v294 = svadd_f32_x(svptrue_b32(), v160, v250); + svfloat32_t v295 = svsub_f32_x(svptrue_b32(), v160, v250); + svfloat32_t v297 = svadd_f32_x(svptrue_b32(), v178, v268); + svfloat32_t v298 = svsub_f32_x(svptrue_b32(), v178, v268); + svfloat32_t v300 = svadd_f32_x(svptrue_b32(), v196, v286); + svfloat32_t v301 = svsub_f32_x(svptrue_b32(), v196, v286); + svfloat32_t v477 = svadd_f32_x(svptrue_b32(), v125, v215); + svfloat32_t v478 = svsub_f32_x(svptrue_b32(), v125, v215); + svfloat32_t v480 = svadd_f32_x(svptrue_b32(), v143, v233); + svfloat32_t v481 = svsub_f32_x(svptrue_b32(), v143, v233); + svfloat32_t v483 = svadd_f32_x(svptrue_b32(), v161, v251); + svfloat32_t v484 = svsub_f32_x(svptrue_b32(), v161, v251); + svfloat32_t v486 = svadd_f32_x(svptrue_b32(), v179, v269); + svfloat32_t v487 = svsub_f32_x(svptrue_b32(), v179, v269); + svfloat32_t v489 = svadd_f32_x(svptrue_b32(), v197, v287); + svfloat32_t v490 = svsub_f32_x(svptrue_b32(), v197, v287); + svfloat32_t v290 = svadd_f32_x(svptrue_b32(), v288, v34); + svfloat32_t v293 = svadd_f32_x(svptrue_b32(), v291, v52); + svfloat32_t v296 = svadd_f32_x(svptrue_b32(), v294, v70); + svfloat32_t v299 = svadd_f32_x(svptrue_b32(), v297, v88); + svfloat32_t v302 = svadd_f32_x(svptrue_b32(), v300, v106); + svfloat32_t v356 = svadd_f32_x(svptrue_b32(), v291, v300); + svfloat32_t v357 = svsub_f32_x(svptrue_b32(), v291, v300); + svfloat32_t v358 = svadd_f32_x(svptrue_b32(), v297, v294); + svfloat32_t v359 = svsub_f32_x(svptrue_b32(), v297, v294); + svfloat32_t v409 = svadd_f32_x(svptrue_b32(), v292, v301); + svfloat32_t v410 = svsub_f32_x(svptrue_b32(), v292, v301); + svfloat32_t v411 = svadd_f32_x(svptrue_b32(), v298, v295); + svfloat32_t v412 = svsub_f32_x(svptrue_b32(), v298, v295); + svfloat32_t v479 = svadd_f32_x(svptrue_b32(), v477, v35); + svfloat32_t v482 = svadd_f32_x(svptrue_b32(), v480, v53); + svfloat32_t v485 = svadd_f32_x(svptrue_b32(), v483, v71); + svfloat32_t v488 = svadd_f32_x(svptrue_b32(), v486, v89); + svfloat32_t v491 = svadd_f32_x(svptrue_b32(), v489, v107); + svfloat32_t v545 = svadd_f32_x(svptrue_b32(), v480, v489); + svfloat32_t v546 = svsub_f32_x(svptrue_b32(), v480, v489); + svfloat32_t 
v547 = svadd_f32_x(svptrue_b32(), v486, v483); + svfloat32_t v548 = svsub_f32_x(svptrue_b32(), v486, v483); + svfloat32_t v598 = svadd_f32_x(svptrue_b32(), v481, v490); + svfloat32_t v599 = svsub_f32_x(svptrue_b32(), v481, v490); + svfloat32_t v600 = svadd_f32_x(svptrue_b32(), v487, v484); + svfloat32_t v601 = svsub_f32_x(svptrue_b32(), v487, v484); + svfloat32_t v303 = svadd_f32_x(svptrue_b32(), v293, v302); + svfloat32_t v304 = svsub_f32_x(svptrue_b32(), v293, v302); + svfloat32_t v305 = svadd_f32_x(svptrue_b32(), v299, v296); + svfloat32_t v306 = svsub_f32_x(svptrue_b32(), v299, v296); + svfloat32_t v360 = svadd_f32_x(svptrue_b32(), v356, v358); + svfloat32_t v361 = svsub_f32_x(svptrue_b32(), v356, v358); + svfloat32_t v362 = svadd_f32_x(svptrue_b32(), v357, v359); + svfloat32_t zero385 = svdup_n_f32(0); + svfloat32_t v385 = svcmla_f32_x(pred_full, zero385, v1174, v357, 90); + svfloat32_t v413 = svadd_f32_x(svptrue_b32(), v409, v411); + svfloat32_t v414 = svsub_f32_x(svptrue_b32(), v409, v411); + svfloat32_t v415 = svadd_f32_x(svptrue_b32(), v410, v412); + svfloat32_t v452 = svmul_f32_x(svptrue_b32(), v412, v1182); + svfloat32_t v492 = svadd_f32_x(svptrue_b32(), v482, v491); + svfloat32_t v493 = svsub_f32_x(svptrue_b32(), v482, v491); + svfloat32_t v494 = svadd_f32_x(svptrue_b32(), v488, v485); + svfloat32_t v495 = svsub_f32_x(svptrue_b32(), v488, v485); + svfloat32_t v549 = svadd_f32_x(svptrue_b32(), v545, v547); + svfloat32_t v550 = svsub_f32_x(svptrue_b32(), v545, v547); + svfloat32_t v551 = svadd_f32_x(svptrue_b32(), v546, v548); + svfloat32_t zero574 = svdup_n_f32(0); + svfloat32_t v574 = svcmla_f32_x(pred_full, zero574, v1174, v546, 90); + svfloat32_t v602 = svadd_f32_x(svptrue_b32(), v598, v600); + svfloat32_t v603 = svsub_f32_x(svptrue_b32(), v598, v600); + svfloat32_t v604 = svadd_f32_x(svptrue_b32(), v599, v601); + svfloat32_t v641 = svmul_f32_x(svptrue_b32(), v601, v1182); + svfloat32_t v307 = svadd_f32_x(svptrue_b32(), v303, v305); + svfloat32_t v308 = svsub_f32_x(svptrue_b32(), v303, v305); + svfloat32_t v309 = svadd_f32_x(svptrue_b32(), v304, v306); + svfloat32_t zero332 = svdup_n_f32(0); + svfloat32_t v332 = svcmla_f32_x(pred_full, zero332, v1168, v304, 90); + svfloat32_t v363 = svadd_f32_x(svptrue_b32(), v360, v288); + svfloat32_t v373 = svmul_f32_x(svptrue_b32(), v360, v1172); + svfloat32_t zero392 = svdup_n_f32(0); + svfloat32_t v392 = svcmla_f32_x(pred_full, zero392, v1175, v362, 90); + svfloat32_t v416 = svadd_f32_x(svptrue_b32(), v413, v289); + svfloat32_t zero437 = svdup_n_f32(0); + svfloat32_t v437 = svcmla_f32_x(pred_full, zero437, v1179, v414, 90); + svfloat32_t v447 = svmul_f32_x(svptrue_b32(), v415, v1181); + svfloat32_t v496 = svadd_f32_x(svptrue_b32(), v492, v494); + svfloat32_t v497 = svsub_f32_x(svptrue_b32(), v492, v494); + svfloat32_t v498 = svadd_f32_x(svptrue_b32(), v493, v495); + svfloat32_t zero521 = svdup_n_f32(0); + svfloat32_t v521 = svcmla_f32_x(pred_full, zero521, v1168, v493, 90); + svfloat32_t v552 = svadd_f32_x(svptrue_b32(), v549, v477); + svfloat32_t v562 = svmul_f32_x(svptrue_b32(), v549, v1172); + svfloat32_t zero581 = svdup_n_f32(0); + svfloat32_t v581 = svcmla_f32_x(pred_full, zero581, v1175, v551, 90); + svfloat32_t v605 = svadd_f32_x(svptrue_b32(), v602, v478); + svfloat32_t zero626 = svdup_n_f32(0); + svfloat32_t v626 = svcmla_f32_x(pred_full, zero626, v1179, v603, 90); + svfloat32_t v636 = svmul_f32_x(svptrue_b32(), v604, v1181); + svfloat32_t v310 = svadd_f32_x(svptrue_b32(), v307, v290); + svfloat32_t zero339 = svdup_n_f32(0); + 
svfloat32_t v339 = svcmla_f32_x(pred_full, zero339, v1169, v309, 90); + svfloat32_t v403 = svsub_f32_x(svptrue_b32(), v385, v392); + svfloat32_t v404 = svcmla_f32_x(pred_full, v392, v1176, v359, 90); + svfloat32_t zero423 = svdup_n_f32(0); + svfloat32_t v423 = svcmla_f32_x(pred_full, zero423, v1177, v416, 90); + svfloat32_t v456 = svnmls_f32_x(pred_full, v447, v410, v1180); + svfloat32_t v457 = svmla_f32_x(pred_full, v452, v415, v1181); + svfloat32_t v499 = svadd_f32_x(svptrue_b32(), v496, v479); + svfloat32_t zero528 = svdup_n_f32(0); + svfloat32_t v528 = svcmla_f32_x(pred_full, zero528, v1169, v498, 90); + svfloat32_t v592 = svsub_f32_x(svptrue_b32(), v574, v581); + svfloat32_t v593 = svcmla_f32_x(pred_full, v581, v1176, v548, 90); + svfloat32_t zero612 = svdup_n_f32(0); + svfloat32_t v612 = svcmla_f32_x(pred_full, zero612, v1177, v605, 90); + svfloat32_t v645 = svnmls_f32_x(pred_full, v636, v599, v1180); + svfloat32_t v646 = svmla_f32_x(pred_full, v641, v604, v1181); + svfloat32_t v347 = svmla_f32_x(pred_full, v310, v307, v1166); + svfloat32_t v350 = svsub_f32_x(svptrue_b32(), v332, v339); + svfloat32_t v351 = svcmla_f32_x(pred_full, v339, v1170, v306, 90); + svfloat32_t v400 = svmla_f32_x(pred_full, v373, v363, v1171); + svfloat32_t v453 = svcmla_f32_x(pred_full, v423, v1178, v413, 90); + svfloat32_t v462 = svmla_f32_x(pred_full, v310, v363, v1171); + svfloat32_t v536 = svmla_f32_x(pred_full, v499, v496, v1166); + svfloat32_t v539 = svsub_f32_x(svptrue_b32(), v521, v528); + svfloat32_t v540 = svcmla_f32_x(pred_full, v528, v1170, v495, 90); + svfloat32_t v589 = svmla_f32_x(pred_full, v562, v552, v1171); + svfloat32_t v642 = svcmla_f32_x(pred_full, v612, v1178, v602, 90); + svfloat32_t v651 = svmla_f32_x(pred_full, v499, v552, v1171); + svst1_f64(pred_full, (double *)(v1190), svreinterpret_f64_f32(v310)); + svst1_f64(pred_full, (double *)(v1199), svreinterpret_f64_f32(v499)); + svfloat32_t v348 = svmla_f32_x(pred_full, v347, v308, v1167); + svfloat32_t v349 = svmls_f32_x(pred_full, v347, v308, v1167); + svfloat32_t v401 = svmla_f32_x(pred_full, v400, v361, v1173); + svfloat32_t v402 = svmls_f32_x(pred_full, v400, v361, v1173); + svfloat32_t v454 = svadd_f32_x(svptrue_b32(), v453, v437); + svfloat32_t v455 = svsub_f32_x(svptrue_b32(), v453, v437); + svfloat32_t v463 = svadd_f32_x(svptrue_b32(), v462, v423); + svfloat32_t v464 = svsub_f32_x(svptrue_b32(), v462, v423); + svfloat32_t v537 = svmla_f32_x(pred_full, v536, v497, v1167); + svfloat32_t v538 = svmls_f32_x(pred_full, v536, v497, v1167); + svfloat32_t v590 = svmla_f32_x(pred_full, v589, v550, v1173); + svfloat32_t v591 = svmls_f32_x(pred_full, v589, v550, v1173); + svfloat32_t v643 = svadd_f32_x(svptrue_b32(), v642, v626); + svfloat32_t v644 = svsub_f32_x(svptrue_b32(), v642, v626); + svfloat32_t v652 = svadd_f32_x(svptrue_b32(), v651, v612); + svfloat32_t v653 = svsub_f32_x(svptrue_b32(), v651, v612); + svfloat32_t v352 = svadd_f32_x(svptrue_b32(), v348, v350); + svfloat32_t v353 = svsub_f32_x(svptrue_b32(), v348, v350); + svfloat32_t v354 = svadd_f32_x(svptrue_b32(), v349, v351); + svfloat32_t v355 = svsub_f32_x(svptrue_b32(), v349, v351); + svfloat32_t v405 = svadd_f32_x(svptrue_b32(), v401, v403); + svfloat32_t v406 = svsub_f32_x(svptrue_b32(), v401, v403); + svfloat32_t v407 = svadd_f32_x(svptrue_b32(), v402, v404); + svfloat32_t v408 = svsub_f32_x(svptrue_b32(), v402, v404); + svfloat32_t v458 = svadd_f32_x(svptrue_b32(), v454, v456); + svfloat32_t v459 = svsub_f32_x(svptrue_b32(), v454, v456); + svfloat32_t v460 = 
svadd_f32_x(svptrue_b32(), v455, v457); + svfloat32_t v461 = svsub_f32_x(svptrue_b32(), v455, v457); + svfloat32_t v541 = svadd_f32_x(svptrue_b32(), v537, v539); + svfloat32_t v542 = svsub_f32_x(svptrue_b32(), v537, v539); + svfloat32_t v543 = svadd_f32_x(svptrue_b32(), v538, v540); + svfloat32_t v544 = svsub_f32_x(svptrue_b32(), v538, v540); + svfloat32_t v594 = svadd_f32_x(svptrue_b32(), v590, v592); + svfloat32_t v595 = svsub_f32_x(svptrue_b32(), v590, v592); + svfloat32_t v596 = svadd_f32_x(svptrue_b32(), v591, v593); + svfloat32_t v597 = svsub_f32_x(svptrue_b32(), v591, v593); + svfloat32_t v647 = svadd_f32_x(svptrue_b32(), v643, v645); + svfloat32_t v648 = svsub_f32_x(svptrue_b32(), v643, v645); + svfloat32_t v649 = svadd_f32_x(svptrue_b32(), v644, v646); + svfloat32_t v650 = svsub_f32_x(svptrue_b32(), v644, v646); + svst1_f64(pred_full, (double *)(v1280), svreinterpret_f64_f32(v464)); + svst1_f64(pred_full, (double *)(v1289), svreinterpret_f64_f32(v653)); + svst1_f64(pred_full, (double *)(v1370), svreinterpret_f64_f32(v463)); + svst1_f64(pred_full, (double *)(v1379), svreinterpret_f64_f32(v652)); + svfloat32_t v465 = svadd_f32_x(svptrue_b32(), v353, v406); + svfloat32_t v468 = svadd_f32_x(svptrue_b32(), v355, v408); + svfloat32_t v471 = svadd_f32_x(svptrue_b32(), v354, v407); + svfloat32_t v474 = svadd_f32_x(svptrue_b32(), v352, v405); + svfloat32_t v654 = svadd_f32_x(svptrue_b32(), v542, v595); + svfloat32_t v657 = svadd_f32_x(svptrue_b32(), v544, v597); + svfloat32_t v660 = svadd_f32_x(svptrue_b32(), v543, v596); + svfloat32_t v663 = svadd_f32_x(svptrue_b32(), v541, v594); + svst1_f64(pred_full, (double *)(v1208), svreinterpret_f64_f32(v353)); + svst1_f64(pred_full, (double *)(v1217), svreinterpret_f64_f32(v542)); + svst1_f64(pred_full, (double *)(v1226), svreinterpret_f64_f32(v355)); + svst1_f64(pred_full, (double *)(v1235), svreinterpret_f64_f32(v544)); + svst1_f64(pred_full, (double *)(v1244), svreinterpret_f64_f32(v354)); + svst1_f64(pred_full, (double *)(v1253), svreinterpret_f64_f32(v543)); + svst1_f64(pred_full, (double *)(v1262), svreinterpret_f64_f32(v352)); + svst1_f64(pred_full, (double *)(v1271), svreinterpret_f64_f32(v541)); + svfloat32_t v466 = svadd_f32_x(svptrue_b32(), v465, v459); + svfloat32_t v467 = svsub_f32_x(svptrue_b32(), v465, v459); + svfloat32_t v469 = svadd_f32_x(svptrue_b32(), v468, v461); + svfloat32_t v470 = svsub_f32_x(svptrue_b32(), v468, v461); + svfloat32_t v472 = svadd_f32_x(svptrue_b32(), v471, v460); + svfloat32_t v473 = svsub_f32_x(svptrue_b32(), v471, v460); + svfloat32_t v475 = svadd_f32_x(svptrue_b32(), v474, v458); + svfloat32_t v476 = svsub_f32_x(svptrue_b32(), v474, v458); + svfloat32_t v655 = svadd_f32_x(svptrue_b32(), v654, v648); + svfloat32_t v656 = svsub_f32_x(svptrue_b32(), v654, v648); + svfloat32_t v658 = svadd_f32_x(svptrue_b32(), v657, v650); + svfloat32_t v659 = svsub_f32_x(svptrue_b32(), v657, v650); + svfloat32_t v661 = svadd_f32_x(svptrue_b32(), v660, v649); + svfloat32_t v662 = svsub_f32_x(svptrue_b32(), v660, v649); + svfloat32_t v664 = svadd_f32_x(svptrue_b32(), v663, v647); + svfloat32_t v665 = svsub_f32_x(svptrue_b32(), v663, v647); + svst1_f64(pred_full, (double *)(v1298), svreinterpret_f64_f32(v467)); + svst1_f64(pred_full, (double *)(v1307), svreinterpret_f64_f32(v656)); + svst1_f64(pred_full, (double *)(v1316), svreinterpret_f64_f32(v470)); + svst1_f64(pred_full, (double *)(v1325), svreinterpret_f64_f32(v659)); + svst1_f64(pred_full, (double *)(v1334), svreinterpret_f64_f32(v473)); + svst1_f64(pred_full, (double 
*)(v1343), svreinterpret_f64_f32(v662)); + svst1_f64(pred_full, (double *)(v1352), svreinterpret_f64_f32(v476)); + svst1_f64(pred_full, (double *)(v1361), svreinterpret_f64_f32(v665)); + svst1_f64(pred_full, (double *)(v1388), svreinterpret_f64_f32(v466)); + svst1_f64(pred_full, (double *)(v1397), svreinterpret_f64_f32(v655)); + svst1_f64(pred_full, (double *)(v1406), svreinterpret_f64_f32(v469)); + svst1_f64(pred_full, (double *)(v1415), svreinterpret_f64_f32(v658)); + svst1_f64(pred_full, (double *)(v1424), svreinterpret_f64_f32(v472)); + svst1_f64(pred_full, (double *)(v1433), svreinterpret_f64_f32(v661)); + svst1_f64(pred_full, (double *)(v1442), svreinterpret_f64_f32(v475)); + svst1_f64(pred_full, (double *)(v1451), svreinterpret_f64_f32(v664)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cf32_ac_n_gu32(const armral_cmplx_int16_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + int idist, float dir) { + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + for (int j = 0; j < howmany; j += 1) { + int16x4_t v339 = vld1s_s16(&v5[istride]); + float v774 = 7.0710678118654757e-01F; + float v785 = -7.0710678118654746e-01F; + float v831 = 5.5557023301960229e-01F; + float v845 = -1.9509032201612861e-01F; + float v892 = 9.2387953251128674e-01F; + float v899 = -9.2387953251128685e-01F; + float v902 = 3.8268343236508967e-01F; + float v903 = -3.8268343236508967e-01F; + float v945 = 1.9509032201612833e-01F; + float v948 = -9.8078528040323043e-01F; + float v949 = 9.8078528040323043e-01F; + float v956 = -5.5557023301960218e-01F; + float v959 = 8.3146961230254524e-01F; + float v960 = -8.3146961230254524e-01F; + float v970 = -1.0000000000000000e+00F; + float v971 = 1.0000000000000000e+00F; + float32x2_t v973 = (float32x2_t){v4, v4}; + int16x4_t v20 = vld1s_s16(&v5[0]); + float32x2_t v340 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v339)), 15); + float32x2_t v604 = (float32x2_t){v949, v949}; + float32x2_t v661 = (float32x2_t){v892, v892}; + float32x2_t v665 = (float32x2_t){v903, v902}; + float32x2_t v718 = (float32x2_t){v959, v959}; + float32x2_t v722 = (float32x2_t){v956, v831}; + float32x2_t v729 = (float32x2_t){v845, v845}; + float32x2_t v775 = (float32x2_t){v774, v774}; + float32x2_t v786 = (float32x2_t){v785, v785}; + float32x2_t v790 = (float32x2_t){v971, v970}; + float32x2_t v832 = (float32x2_t){v831, v831}; + float32x2_t v836 = (float32x2_t){v960, v959}; + float32x2_t v843 = (float32x2_t){v948, v948}; + float32x2_t v847 = (float32x2_t){v845, v945}; + float32x2_t v889 = (float32x2_t){v902, v902}; + float32x2_t v893 = (float32x2_t){v899, v892}; + float32x2_t v900 = (float32x2_t){v899, v899}; + float32x2_t v904 = (float32x2_t){v902, v903}; + float32x2_t v946 = (float32x2_t){v945, v945}; + float32x2_t v950 = (float32x2_t){v948, v949}; + float32x2_t v957 = (float32x2_t){v956, v956}; + float32x2_t v961 = (float32x2_t){v959, v960}; + float32x2_t v972 = (float32x2_t){v970, v971}; + float32x2_t v21 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v20)), 15); + int16x4_t v26 = vld1s_s16(&v5[istride * 16]); + int16x4_t v34 = vld1s_s16(&v5[istride * 8]); + int16x4_t v40 = vld1s_s16(&v5[istride * 24]); + int16x4_t v59 = vld1s_s16(&v5[istride * 4]); + int16x4_t v65 = vld1s_s16(&v5[istride * 20]); + int16x4_t v73 = vld1s_s16(&v5[istride * 12]); + int16x4_t v79 = vld1s_s16(&v5[istride * 28]); + int16x4_t v137 = vld1s_s16(&v5[istride * 2]); + int16x4_t v143 = vld1s_s16(&v5[istride * 18]); + 
int16x4_t v151 = vld1s_s16(&v5[istride * 10]); + int16x4_t v157 = vld1s_s16(&v5[istride * 26]); + int16x4_t v176 = vld1s_s16(&v5[istride * 6]); + int16x4_t v182 = vld1s_s16(&v5[istride * 22]); + int16x4_t v190 = vld1s_s16(&v5[istride * 14]); + int16x4_t v196 = vld1s_s16(&v5[istride * 30]); + int16x4_t v345 = vld1s_s16(&v5[istride * 17]); + int16x4_t v353 = vld1s_s16(&v5[istride * 9]); + int16x4_t v359 = vld1s_s16(&v5[istride * 25]); + int16x4_t v378 = vld1s_s16(&v5[istride * 5]); + int16x4_t v384 = vld1s_s16(&v5[istride * 21]); + int16x4_t v392 = vld1s_s16(&v5[istride * 13]); + int16x4_t v398 = vld1s_s16(&v5[istride * 29]); + int16x4_t v456 = vld1s_s16(&v5[istride * 3]); + int16x4_t v462 = vld1s_s16(&v5[istride * 19]); + int16x4_t v470 = vld1s_s16(&v5[istride * 11]); + int16x4_t v476 = vld1s_s16(&v5[istride * 27]); + int16x4_t v495 = vld1s_s16(&v5[istride * 7]); + int16x4_t v501 = vld1s_s16(&v5[istride * 23]); + int16x4_t v509 = vld1s_s16(&v5[istride * 15]); + int16x4_t v515 = vld1s_s16(&v5[istride * 31]); + float32x2_t v667 = vmul_f32(v973, v665); + float32x2_t v724 = vmul_f32(v973, v722); + float32x2_t v792 = vmul_f32(v973, v790); + float32x2_t v838 = vmul_f32(v973, v836); + float32x2_t v849 = vmul_f32(v973, v847); + float32x2_t v895 = vmul_f32(v973, v893); + float32x2_t v906 = vmul_f32(v973, v904); + float32x2_t v952 = vmul_f32(v973, v950); + float32x2_t v963 = vmul_f32(v973, v961); + float32x2_t v974 = vmul_f32(v973, v972); + float32x2_t v27 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v26)), 15); + float32x2_t v35 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v34)), 15); + float32x2_t v41 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v40)), 15); + float32x2_t v60 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v59)), 15); + float32x2_t v66 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v65)), 15); + float32x2_t v74 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v73)), 15); + float32x2_t v80 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v79)), 15); + float32x2_t v138 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v137)), 15); + float32x2_t v144 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v143)), 15); + float32x2_t v152 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v151)), 15); + float32x2_t v158 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v157)), 15); + float32x2_t v177 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v176)), 15); + float32x2_t v183 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v182)), 15); + float32x2_t v191 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v190)), 15); + float32x2_t v197 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v196)), 15); + float32x2_t v346 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v345)), 15); + float32x2_t v354 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v353)), 15); + float32x2_t v360 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v359)), 15); + float32x2_t v379 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v378)), 15); + float32x2_t v385 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v384)), 15); + float32x2_t v393 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v392)), 15); + float32x2_t v399 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v398)), 15); + float32x2_t v457 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v456)), 15); + float32x2_t v463 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v462)), 15); + float32x2_t v471 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v470)), 15); + float32x2_t v477 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v476)), 15); + float32x2_t v496 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v495)), 15); + float32x2_t v502 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v501)), 15); + float32x2_t v510 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v509)), 15); + float32x2_t v516 = 
vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v515)), 15); + float32x2_t v28 = vadd_f32(v21, v27); + float32x2_t v29 = vsub_f32(v21, v27); + float32x2_t v42 = vadd_f32(v35, v41); + float32x2_t v43 = vsub_f32(v35, v41); + float32x2_t v67 = vadd_f32(v60, v66); + float32x2_t v68 = vsub_f32(v60, v66); + float32x2_t v81 = vadd_f32(v74, v80); + float32x2_t v82 = vsub_f32(v74, v80); + float32x2_t v145 = vadd_f32(v138, v144); + float32x2_t v146 = vsub_f32(v138, v144); + float32x2_t v159 = vadd_f32(v152, v158); + float32x2_t v160 = vsub_f32(v152, v158); + float32x2_t v184 = vadd_f32(v177, v183); + float32x2_t v185 = vsub_f32(v177, v183); + float32x2_t v198 = vadd_f32(v191, v197); + float32x2_t v199 = vsub_f32(v191, v197); + float32x2_t v347 = vadd_f32(v340, v346); + float32x2_t v348 = vsub_f32(v340, v346); + float32x2_t v361 = vadd_f32(v354, v360); + float32x2_t v362 = vsub_f32(v354, v360); + float32x2_t v386 = vadd_f32(v379, v385); + float32x2_t v387 = vsub_f32(v379, v385); + float32x2_t v400 = vadd_f32(v393, v399); + float32x2_t v401 = vsub_f32(v393, v399); + float32x2_t v464 = vadd_f32(v457, v463); + float32x2_t v465 = vsub_f32(v457, v463); + float32x2_t v478 = vadd_f32(v471, v477); + float32x2_t v479 = vsub_f32(v471, v477); + float32x2_t v503 = vadd_f32(v496, v502); + float32x2_t v504 = vsub_f32(v496, v502); + float32x2_t v517 = vadd_f32(v510, v516); + float32x2_t v518 = vsub_f32(v510, v516); + float32x2_t v49 = vrev64_f32(v43); + float32x2_t v51 = vadd_f32(v28, v42); + float32x2_t v52 = vsub_f32(v28, v42); + float32x2_t v83 = vadd_f32(v67, v81); + float32x2_t v84 = vsub_f32(v67, v81); + float32x2_t v99 = vmul_f32(v68, v775); + float32x2_t v110 = vmul_f32(v82, v786); + float32x2_t v166 = vrev64_f32(v160); + float32x2_t v168 = vadd_f32(v145, v159); + float32x2_t v169 = vsub_f32(v145, v159); + float32x2_t v205 = vrev64_f32(v199); + float32x2_t v207 = vadd_f32(v184, v198); + float32x2_t v208 = vsub_f32(v184, v198); + float32x2_t v368 = vrev64_f32(v362); + float32x2_t v370 = vadd_f32(v347, v361); + float32x2_t v371 = vsub_f32(v347, v361); + float32x2_t v402 = vadd_f32(v386, v400); + float32x2_t v403 = vsub_f32(v386, v400); + float32x2_t v418 = vmul_f32(v387, v775); + float32x2_t v429 = vmul_f32(v401, v786); + float32x2_t v485 = vrev64_f32(v479); + float32x2_t v487 = vadd_f32(v464, v478); + float32x2_t v488 = vsub_f32(v464, v478); + float32x2_t v519 = vadd_f32(v503, v517); + float32x2_t v520 = vsub_f32(v503, v517); + float32x2_t v535 = vmul_f32(v504, v775); + float32x2_t v546 = vmul_f32(v518, v786); + float32x2_t v50 = vmul_f32(v49, v792); + float32x2_t v90 = vrev64_f32(v84); + float32x2_t v92 = vadd_f32(v51, v83); + float32x2_t v93 = vsub_f32(v51, v83); + float32x2_t v105 = vrev64_f32(v99); + float32x2_t v116 = vrev64_f32(v110); + float32x2_t v167 = vmul_f32(v166, v792); + float32x2_t v206 = vmul_f32(v205, v792); + float32x2_t v211 = vadd_f32(v168, v207); + float32x2_t v212 = vsub_f32(v168, v207); + float32x2_t v264 = vmul_f32(v169, v775); + float32x2_t v275 = vmul_f32(v208, v786); + float32x2_t v369 = vmul_f32(v368, v792); + float32x2_t v409 = vrev64_f32(v403); + float32x2_t v411 = vadd_f32(v370, v402); + float32x2_t v412 = vsub_f32(v370, v402); + float32x2_t v424 = vrev64_f32(v418); + float32x2_t v435 = vrev64_f32(v429); + float32x2_t v486 = vmul_f32(v485, v792); + float32x2_t v526 = vrev64_f32(v520); + float32x2_t v528 = vadd_f32(v487, v519); + float32x2_t v529 = vsub_f32(v487, v519); + float32x2_t v541 = vrev64_f32(v535); + float32x2_t v552 = vrev64_f32(v546); + float32x2_t v53 = vsub_f32(v29, v50); + 
float32x2_t v54 = vadd_f32(v29, v50); + float32x2_t v91 = vmul_f32(v90, v792); + float32x2_t v106 = vmul_f32(v105, v974); + float32x2_t v117 = vmul_f32(v116, v792); + float32x2_t v170 = vsub_f32(v146, v167); + float32x2_t v171 = vadd_f32(v146, v167); + float32x2_t v209 = vsub_f32(v185, v206); + float32x2_t v210 = vadd_f32(v185, v206); + float32x2_t v218 = vrev64_f32(v212); + float32x2_t v220 = vadd_f32(v92, v211); + float32x2_t v221 = vsub_f32(v92, v211); + float32x2_t v270 = vrev64_f32(v264); + float32x2_t v281 = vrev64_f32(v275); + float32x2_t v372 = vsub_f32(v348, v369); + float32x2_t v373 = vadd_f32(v348, v369); + float32x2_t v410 = vmul_f32(v409, v792); + float32x2_t v425 = vmul_f32(v424, v974); + float32x2_t v436 = vmul_f32(v435, v792); + float32x2_t v489 = vsub_f32(v465, v486); + float32x2_t v490 = vadd_f32(v465, v486); + float32x2_t v527 = vmul_f32(v526, v792); + float32x2_t v542 = vmul_f32(v541, v974); + float32x2_t v553 = vmul_f32(v552, v792); + float32x2_t v569 = vadd_f32(v411, v528); + float32x2_t v570 = vsub_f32(v411, v528); + float32x2_t v776 = vmul_f32(v412, v775); + float32x2_t v787 = vmul_f32(v529, v786); + float32x2_t v94 = vsub_f32(v52, v91); + float32x2_t v95 = vadd_f32(v52, v91); + float32x2_t v118 = vadd_f32(v99, v106); + float32x2_t v119 = vadd_f32(v110, v117); + float32x2_t v219 = vmul_f32(v218, v792); + float32x2_t v227 = vmul_f32(v170, v661); + float32x2_t v233 = vrev64_f32(v170); + float32x2_t v238 = vmul_f32(v209, v889); + float32x2_t v244 = vrev64_f32(v209); + float32x2_t v271 = vmul_f32(v270, v974); + float32x2_t v282 = vmul_f32(v281, v792); + float32x2_t v301 = vmul_f32(v171, v889); + float32x2_t v307 = vrev64_f32(v171); + float32x2_t v312 = vmul_f32(v210, v900); + float32x2_t v318 = vrev64_f32(v210); + float32x2_t v413 = vsub_f32(v371, v410); + float32x2_t v414 = vadd_f32(v371, v410); + float32x2_t v437 = vadd_f32(v418, v425); + float32x2_t v438 = vadd_f32(v429, v436); + float32x2_t v530 = vsub_f32(v488, v527); + float32x2_t v531 = vadd_f32(v488, v527); + float32x2_t v554 = vadd_f32(v535, v542); + float32x2_t v555 = vadd_f32(v546, v553); + float32x2_t v576 = vrev64_f32(v570); + float32x2_t v578 = vadd_f32(v220, v569); + float32x2_t v579 = vsub_f32(v220, v569); + float32x2_t v782 = vrev64_f32(v776); + float32x2_t v793 = vrev64_f32(v787); + float32x2_t v120 = vadd_f32(v118, v119); + float32x2_t v121 = vsub_f32(v119, v118); + float32x2_t v222 = vsub_f32(v93, v219); + float32x2_t v223 = vadd_f32(v93, v219); + float32x2_t v283 = vadd_f32(v264, v271); + float32x2_t v284 = vadd_f32(v275, v282); + float32x2_t v439 = vadd_f32(v437, v438); + float32x2_t v440 = vsub_f32(v438, v437); + float32x2_t v556 = vadd_f32(v554, v555); + float32x2_t v557 = vsub_f32(v555, v554); + float32x2_t v577 = vmul_f32(v576, v792); + v6[0] = v578; + v6[ostride * 16] = v579; + float32x2_t v662 = vmul_f32(v413, v661); + float32x2_t v668 = vrev64_f32(v413); + float32x2_t v673 = vmul_f32(v530, v889); + float32x2_t v679 = vrev64_f32(v530); + float32x2_t v783 = vmul_f32(v782, v974); + float32x2_t v794 = vmul_f32(v793, v792); + float32x2_t v890 = vmul_f32(v414, v889); + float32x2_t v896 = vrev64_f32(v414); + float32x2_t v901 = vmul_f32(v531, v900); + float32x2_t v907 = vrev64_f32(v531); + float32x2_t v127 = vrev64_f32(v121); + float32x2_t v129 = vadd_f32(v53, v120); + float32x2_t v130 = vsub_f32(v53, v120); + float32x2_t v246 = vfma_f32(v227, v233, v667); + float32x2_t v247 = vfma_f32(v238, v244, v895); + float32x2_t v285 = vadd_f32(v283, v284); + float32x2_t v286 = vsub_f32(v284, v283); + 
float32x2_t v320 = vfma_f32(v301, v307, v895); + float32x2_t v321 = vfma_f32(v312, v318, v906); + float32x2_t v446 = vrev64_f32(v440); + float32x2_t v448 = vadd_f32(v372, v439); + float32x2_t v449 = vsub_f32(v372, v439); + float32x2_t v563 = vrev64_f32(v557); + float32x2_t v565 = vadd_f32(v489, v556); + float32x2_t v566 = vsub_f32(v489, v556); + float32x2_t v580 = vsub_f32(v221, v577); + float32x2_t v581 = vadd_f32(v221, v577); + float32x2_t v795 = vadd_f32(v776, v783); + float32x2_t v796 = vadd_f32(v787, v794); + float32x2_t v128 = vmul_f32(v127, v974); + float32x2_t v248 = vadd_f32(v246, v247); + float32x2_t v249 = vsub_f32(v247, v246); + float32x2_t v292 = vrev64_f32(v286); + float32x2_t v294 = vadd_f32(v94, v285); + float32x2_t v295 = vsub_f32(v94, v285); + float32x2_t v322 = vadd_f32(v320, v321); + float32x2_t v323 = vsub_f32(v321, v320); + float32x2_t v447 = vmul_f32(v446, v974); + float32x2_t v564 = vmul_f32(v563, v974); + v6[ostride * 8] = v580; + v6[ostride * 24] = v581; + float32x2_t v605 = vmul_f32(v448, v604); + float32x2_t v611 = vrev64_f32(v448); + float32x2_t v616 = vmul_f32(v565, v718); + float32x2_t v622 = vrev64_f32(v565); + float32x2_t v681 = vfma_f32(v662, v668, v667); + float32x2_t v682 = vfma_f32(v673, v679, v895); + float32x2_t v797 = vadd_f32(v795, v796); + float32x2_t v798 = vsub_f32(v796, v795); + float32x2_t v833 = vmul_f32(v449, v832); + float32x2_t v839 = vrev64_f32(v449); + float32x2_t v844 = vmul_f32(v566, v843); + float32x2_t v850 = vrev64_f32(v566); + float32x2_t v909 = vfma_f32(v890, v896, v895); + float32x2_t v910 = vfma_f32(v901, v907, v906); + float32x2_t v131 = vsub_f32(v54, v128); + float32x2_t v132 = vadd_f32(v54, v128); + float32x2_t v255 = vrev64_f32(v249); + float32x2_t v257 = vadd_f32(v129, v248); + float32x2_t v258 = vsub_f32(v129, v248); + float32x2_t v293 = vmul_f32(v292, v974); + float32x2_t v329 = vrev64_f32(v323); + float32x2_t v450 = vsub_f32(v373, v447); + float32x2_t v451 = vadd_f32(v373, v447); + float32x2_t v567 = vsub_f32(v490, v564); + float32x2_t v568 = vadd_f32(v490, v564); + float32x2_t v683 = vadd_f32(v681, v682); + float32x2_t v684 = vsub_f32(v682, v681); + float32x2_t v804 = vrev64_f32(v798); + float32x2_t v806 = vadd_f32(v222, v797); + float32x2_t v807 = vsub_f32(v222, v797); + float32x2_t v911 = vadd_f32(v909, v910); + float32x2_t v912 = vsub_f32(v910, v909); + float32x2_t v256 = vmul_f32(v255, v974); + float32x2_t v296 = vsub_f32(v95, v293); + float32x2_t v297 = vadd_f32(v95, v293); + float32x2_t v330 = vmul_f32(v329, v974); + float32x2_t v331 = vadd_f32(v131, v322); + float32x2_t v332 = vsub_f32(v131, v322); + float32x2_t v624 = vfma_f32(v605, v611, v849); + float32x2_t v625 = vfma_f32(v616, v622, v724); + float32x2_t v690 = vrev64_f32(v684); + float32x2_t v692 = vadd_f32(v294, v683); + float32x2_t v693 = vsub_f32(v294, v683); + float32x2_t v719 = vmul_f32(v450, v718); + float32x2_t v725 = vrev64_f32(v450); + float32x2_t v730 = vmul_f32(v567, v729); + float32x2_t v736 = vrev64_f32(v567); + float32x2_t v805 = vmul_f32(v804, v974); + v6[ostride * 4] = v806; + v6[ostride * 20] = v807; + float32x2_t v852 = vfma_f32(v833, v839, v838); + float32x2_t v853 = vfma_f32(v844, v850, v849); + float32x2_t v918 = vrev64_f32(v912); + float32x2_t v947 = vmul_f32(v451, v946); + float32x2_t v953 = vrev64_f32(v451); + float32x2_t v958 = vmul_f32(v568, v957); + float32x2_t v964 = vrev64_f32(v568); + float32x2_t v259 = vsub_f32(v130, v256); + float32x2_t v260 = vadd_f32(v130, v256); + float32x2_t v333 = vsub_f32(v132, v330); + float32x2_t v334 = 
vadd_f32(v132, v330); + float32x2_t v626 = vadd_f32(v624, v625); + float32x2_t v627 = vsub_f32(v625, v624); + float32x2_t v691 = vmul_f32(v690, v974); + v6[ostride * 2] = v692; + v6[ostride * 18] = v693; + float32x2_t v808 = vsub_f32(v223, v805); + float32x2_t v809 = vadd_f32(v223, v805); + float32x2_t v854 = vadd_f32(v852, v853); + float32x2_t v855 = vsub_f32(v853, v852); + float32x2_t v919 = vmul_f32(v918, v974); + float32x2_t v920 = vadd_f32(v296, v911); + float32x2_t v921 = vsub_f32(v296, v911); + float32x2_t v633 = vrev64_f32(v627); + float32x2_t v635 = vadd_f32(v257, v626); + float32x2_t v636 = vsub_f32(v257, v626); + float32x2_t v694 = vsub_f32(v295, v691); + float32x2_t v695 = vadd_f32(v295, v691); + float32x2_t v738 = vfma_f32(v719, v725, v724); + float32x2_t v739 = vfma_f32(v730, v736, v952); + v6[ostride * 12] = v808; + v6[ostride * 28] = v809; + float32x2_t v861 = vrev64_f32(v855); + float32x2_t v863 = vadd_f32(v259, v854); + float32x2_t v864 = vsub_f32(v259, v854); + float32x2_t v922 = vsub_f32(v297, v919); + float32x2_t v923 = vadd_f32(v297, v919); + v6[ostride * 6] = v920; + v6[ostride * 22] = v921; + float32x2_t v966 = vfma_f32(v947, v953, v952); + float32x2_t v967 = vfma_f32(v958, v964, v963); + float32x2_t v634 = vmul_f32(v633, v974); + v6[ostride] = v635; + v6[ostride * 17] = v636; + v6[ostride * 10] = v694; + v6[ostride * 26] = v695; + float32x2_t v740 = vadd_f32(v738, v739); + float32x2_t v741 = vsub_f32(v739, v738); + float32x2_t v862 = vmul_f32(v861, v974); + v6[ostride * 5] = v863; + v6[ostride * 21] = v864; + v6[ostride * 14] = v922; + v6[ostride * 30] = v923; + float32x2_t v968 = vadd_f32(v966, v967); + float32x2_t v969 = vsub_f32(v967, v966); + float32x2_t v637 = vsub_f32(v258, v634); + float32x2_t v638 = vadd_f32(v258, v634); + float32x2_t v747 = vrev64_f32(v741); + float32x2_t v749 = vadd_f32(v331, v740); + float32x2_t v750 = vsub_f32(v331, v740); + float32x2_t v865 = vsub_f32(v260, v862); + float32x2_t v866 = vadd_f32(v260, v862); + float32x2_t v975 = vrev64_f32(v969); + float32x2_t v977 = vadd_f32(v333, v968); + float32x2_t v978 = vsub_f32(v333, v968); + v6[ostride * 9] = v637; + v6[ostride * 25] = v638; + float32x2_t v748 = vmul_f32(v747, v974); + v6[ostride * 3] = v749; + v6[ostride * 19] = v750; + v6[ostride * 13] = v865; + v6[ostride * 29] = v866; + float32x2_t v976 = vmul_f32(v975, v974); + v6[ostride * 7] = v977; + v6[ostride * 23] = v978; + float32x2_t v751 = vsub_f32(v332, v748); + float32x2_t v752 = vadd_f32(v332, v748); + float32x2_t v979 = vsub_f32(v334, v976); + float32x2_t v980 = vadd_f32(v334, v976); + v6[ostride * 11] = v751; + v6[ostride * 27] = v752; + v6[ostride * 15] = v979; + v6[ostride * 31] = v980; + v5 += 1 * idist; + v6 += 1 * 1; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cf32_ac_n_gu32(const armral_cmplx_int16_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + int idist, float dir) { + int64_t v0 = istride; + int64_t v1 = idist; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * v1; + int64_t v12 = v10 * 1; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + float v835 = -1.9509032201612819e-01F; + float v890 = 7.0710678118654757e-01F; + float v902 = -7.0710678118654746e-01F; + float v907 = -1.0000000000000000e+00F; + float v957 = 5.5557023301960229e-01F; + float v962 = 
8.3146961230254524e-01F; + float v969 = -9.8078528040323043e-01F; + float v1024 = 3.8268343236508984e-01F; + float v1029 = 9.2387953251128674e-01F; + float v1036 = -9.2387953251128685e-01F; + float v1041 = -3.8268343236508967e-01F; + float v1091 = 1.9509032201612833e-01F; + float v1096 = 9.8078528040323043e-01F; + float v1103 = -5.5557023301960218e-01F; + float v1108 = -8.3146961230254524e-01F; + const int32_t *v1333 = &v5[v0]; + float32x2_t *v1534 = &v6[v2]; + int64_t v27 = v0 * 16; + int64_t v37 = v0 * 8; + int64_t v45 = v0 * 24; + int64_t v66 = v0 * 4; + int64_t v74 = v0 * 20; + int64_t v84 = v0 * 12; + int64_t v92 = v0 * 28; + int64_t v154 = v0 * 2; + int64_t v162 = v0 * 18; + int64_t v172 = v0 * 10; + int64_t v180 = v0 * 26; + int64_t v201 = v0 * 6; + int64_t v209 = v0 * 22; + int64_t v219 = v0 * 14; + int64_t v227 = v0 * 30; + int64_t v386 = v0 * 17; + int64_t v396 = v0 * 9; + int64_t v404 = v0 * 25; + int64_t v425 = v0 * 5; + int64_t v433 = v0 * 21; + int64_t v443 = v0 * 13; + int64_t v451 = v0 * 29; + int64_t v513 = v0 * 3; + int64_t v521 = v0 * 19; + int64_t v531 = v0 * 11; + int64_t v539 = v0 * 27; + int64_t v560 = v0 * 7; + int64_t v568 = v0 * 23; + int64_t v578 = v0 * 15; + int64_t v586 = v0 * 31; + int64_t v668 = v2 * 8; + int64_t v675 = v2 * 16; + int64_t v682 = v2 * 24; + int64_t v735 = v2 * 9; + int64_t v742 = v2 * 17; + int64_t v749 = v2 * 25; + float v764 = v4 * v1024; + int64_t v795 = v2 * 2; + int64_t v802 = v2 * 10; + int64_t v809 = v2 * 18; + int64_t v816 = v2 * 26; + float v831 = v4 * v957; + int64_t v862 = v2 * 3; + int64_t v869 = v2 * 11; + int64_t v876 = v2 * 19; + int64_t v883 = v2 * 27; + float v910 = v4 * v907; + int64_t v929 = v2 * 4; + int64_t v936 = v2 * 12; + int64_t v943 = v2 * 20; + int64_t v950 = v2 * 28; + float v965 = v4 * v962; + float v977 = v4 * v1091; + int64_t v996 = v2 * 5; + int64_t v1003 = v2 * 13; + int64_t v1010 = v2 * 21; + int64_t v1017 = v2 * 29; + float v1032 = v4 * v1029; + float v1044 = v4 * v1041; + int64_t v1063 = v2 * 6; + int64_t v1070 = v2 * 14; + int64_t v1077 = v2 * 22; + int64_t v1084 = v2 * 30; + float v1099 = v4 * v1096; + float v1111 = v4 * v1108; + int64_t v1130 = v2 * 7; + int64_t v1137 = v2 * 15; + int64_t v1144 = v2 * 23; + int64_t v1151 = v2 * 31; + const int32_t *v1164 = &v5[0]; + svint64_t v1477 = svindex_s64(0, v1); + float32x2_t *v1493 = &v6[0]; + svfloat32_t v1523 = svdup_n_f32(v1096); + svfloat32_t v1564 = svdup_n_f32(v1029); + svfloat32_t v1605 = svdup_n_f32(v962); + svfloat32_t v1607 = svdup_n_f32(v835); + svfloat32_t v1646 = svdup_n_f32(v890); + svfloat32_t v1648 = svdup_n_f32(v902); + svfloat32_t v1687 = svdup_n_f32(v957); + svfloat32_t v1689 = svdup_n_f32(v969); + svfloat32_t v1728 = svdup_n_f32(v1024); + svfloat32_t v1730 = svdup_n_f32(v1036); + svfloat32_t v1769 = svdup_n_f32(v1091); + svfloat32_t v1771 = svdup_n_f32(v1103); + svfloat32_t v1773 = svdup_n_f32(v4); + svint16_t v1166 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1164), v1477)); + const int32_t *v1173 = &v5[v27]; + const int32_t *v1182 = &v5[v37]; + const int32_t *v1191 = &v5[v45]; + const int32_t *v1201 = &v5[v66]; + const int32_t *v1210 = &v5[v74]; + const int32_t *v1219 = &v5[v84]; + const int32_t *v1228 = &v5[v92]; + const int32_t *v1243 = &v5[v154]; + const int32_t *v1252 = &v5[v162]; + const int32_t *v1261 = &v5[v172]; + const int32_t *v1270 = &v5[v180]; + const int32_t *v1280 = &v5[v201]; + const int32_t *v1289 = &v5[v209]; + const int32_t *v1298 = &v5[v219]; + const int32_t *v1307 = &v5[v227]; + 
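+ /* Note on the batched SVE variant (descriptive comment, not part of the generated output): each
+    32-bit svld1uw gather below pulls one complex int16 sample per transform in the batch, with
+    per-lane offsets stepping by idist via v1477. The svtbl index pattern places each int16 value in
+    the top half of a 32-bit lane, so converting with svcvt and scaling by 1.F / (1ULL << 31ULL)
+    yields the Q15 sample as a float32 in [-1, 1). */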
svint16_t v1335 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1333), v1477)); + const int32_t *v1342 = &v5[v386]; + const int32_t *v1351 = &v5[v396]; + const int32_t *v1360 = &v5[v404]; + const int32_t *v1370 = &v5[v425]; + const int32_t *v1379 = &v5[v433]; + const int32_t *v1388 = &v5[v443]; + const int32_t *v1397 = &v5[v451]; + const int32_t *v1412 = &v5[v513]; + const int32_t *v1421 = &v5[v521]; + const int32_t *v1430 = &v5[v531]; + const int32_t *v1439 = &v5[v539]; + const int32_t *v1449 = &v5[v560]; + const int32_t *v1458 = &v5[v568]; + const int32_t *v1467 = &v5[v578]; + const int32_t *v1476 = &v5[v586]; + float32x2_t *v1502 = &v6[v668]; + float32x2_t *v1511 = &v6[v675]; + float32x2_t *v1520 = &v6[v682]; + float32x2_t *v1543 = &v6[v735]; + float32x2_t *v1552 = &v6[v742]; + float32x2_t *v1561 = &v6[v749]; + svfloat32_t v1565 = svdup_n_f32(v764); + float32x2_t *v1575 = &v6[v795]; + float32x2_t *v1584 = &v6[v802]; + float32x2_t *v1593 = &v6[v809]; + float32x2_t *v1602 = &v6[v816]; + svfloat32_t v1606 = svdup_n_f32(v831); + float32x2_t *v1616 = &v6[v862]; + float32x2_t *v1625 = &v6[v869]; + float32x2_t *v1634 = &v6[v876]; + float32x2_t *v1643 = &v6[v883]; + svfloat32_t v1649 = svdup_n_f32(v910); + float32x2_t *v1657 = &v6[v929]; + float32x2_t *v1666 = &v6[v936]; + float32x2_t *v1675 = &v6[v943]; + float32x2_t *v1684 = &v6[v950]; + svfloat32_t v1688 = svdup_n_f32(v965); + svfloat32_t v1690 = svdup_n_f32(v977); + float32x2_t *v1698 = &v6[v996]; + float32x2_t *v1707 = &v6[v1003]; + float32x2_t *v1716 = &v6[v1010]; + float32x2_t *v1725 = &v6[v1017]; + svfloat32_t v1729 = svdup_n_f32(v1032); + svfloat32_t v1731 = svdup_n_f32(v1044); + float32x2_t *v1739 = &v6[v1063]; + float32x2_t *v1748 = &v6[v1070]; + float32x2_t *v1757 = &v6[v1077]; + float32x2_t *v1766 = &v6[v1084]; + svfloat32_t v1770 = svdup_n_f32(v1099); + svfloat32_t v1772 = svdup_n_f32(v1111); + float32x2_t *v1780 = &v6[v1130]; + float32x2_t *v1789 = &v6[v1137]; + float32x2_t *v1798 = &v6[v1144]; + float32x2_t *v1807 = &v6[v1151]; + svfloat32_t v25 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1166, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v384 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1335, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svint16_t v1175 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1173), v1477)); + svint16_t v1184 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1182), v1477)); + svint16_t v1193 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1191), v1477)); + svint16_t v1203 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1201), v1477)); + svint16_t v1212 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1210), v1477)); + svint16_t v1221 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1219), v1477)); + svint16_t v1230 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1228), v1477)); + svint16_t v1245 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1243), v1477)); + svint16_t v1254 = 
svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1252), v1477)); + svint16_t v1263 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1261), v1477)); + svint16_t v1272 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1270), v1477)); + svint16_t v1282 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1280), v1477)); + svint16_t v1291 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1289), v1477)); + svint16_t v1300 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1298), v1477)); + svint16_t v1309 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1307), v1477)); + svint16_t v1344 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1342), v1477)); + svint16_t v1353 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1351), v1477)); + svint16_t v1362 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1360), v1477)); + svint16_t v1372 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1370), v1477)); + svint16_t v1381 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1379), v1477)); + svint16_t v1390 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1388), v1477)); + svint16_t v1399 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1397), v1477)); + svint16_t v1414 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1412), v1477)); + svint16_t v1423 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1421), v1477)); + svint16_t v1432 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1430), v1477)); + svint16_t v1441 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1439), v1477)); + svint16_t v1451 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1449), v1477)); + svint16_t v1460 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1458), v1477)); + svint16_t v1469 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1467), v1477)); + svint16_t v1478 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1476), v1477)); + svfloat32_t v33 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1175, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v43 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1184, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v51 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1193, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v72 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1203, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 
31ULL)); + svfloat32_t v80 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1212, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v90 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1221, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v98 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1230, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v160 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1245, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v168 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1254, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v178 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1263, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v186 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1272, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v207 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1282, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v215 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1291, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v225 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1300, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v233 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1309, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v392 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1344, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v402 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1353, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v410 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1362, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v431 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1372, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + 
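/* The svtbl above places each Q15 int16 sample in the upper halfword of a 32-bit lane (out-of-range table indices give zero), so scaling the converted value by 1.F / (1ULL << 31ULL) amounts to the Q15-to-float conversion (divide by 2^15), matching the Neon path's vcvt_n_f32_s32(..., 15). */ +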
svfloat32_t v439 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1381, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v449 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1390, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v457 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1399, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v519 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1414, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v527 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1423, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v537 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1432, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v545 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1441, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v566 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1451, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v574 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1460, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v584 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1469, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v592 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1478, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v34 = svadd_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v35 = svsub_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v52 = svadd_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v53 = svsub_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v81 = svadd_f32_x(svptrue_b32(), v72, v80); + svfloat32_t v82 = svsub_f32_x(svptrue_b32(), v72, v80); + svfloat32_t v99 = svadd_f32_x(svptrue_b32(), v90, v98); + svfloat32_t v100 = svsub_f32_x(svptrue_b32(), v90, v98); + svfloat32_t v169 = svadd_f32_x(svptrue_b32(), v160, v168); + svfloat32_t v170 = svsub_f32_x(svptrue_b32(), v160, v168); + svfloat32_t v187 = svadd_f32_x(svptrue_b32(), v178, v186); + svfloat32_t v188 = svsub_f32_x(svptrue_b32(), v178, v186); + svfloat32_t v216 = svadd_f32_x(svptrue_b32(), v207, v215); + svfloat32_t v217 = svsub_f32_x(svptrue_b32(), v207, v215); + svfloat32_t v234 = svadd_f32_x(svptrue_b32(), v225, v233); + svfloat32_t v235 = svsub_f32_x(svptrue_b32(), v225, v233); + 
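/* The svadd/svsub pairs are radix-2 butterflies. svcmla_f32_x(pg, acc, dup(c), x, 90) computes acc + j*c*x, which appears to be how the generated kernel applies the imaginary (sine) parts of the twiddle factors. */ +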
svfloat32_t v393 = svadd_f32_x(svptrue_b32(), v384, v392); + svfloat32_t v394 = svsub_f32_x(svptrue_b32(), v384, v392); + svfloat32_t v411 = svadd_f32_x(svptrue_b32(), v402, v410); + svfloat32_t v412 = svsub_f32_x(svptrue_b32(), v402, v410); + svfloat32_t v440 = svadd_f32_x(svptrue_b32(), v431, v439); + svfloat32_t v441 = svsub_f32_x(svptrue_b32(), v431, v439); + svfloat32_t v458 = svadd_f32_x(svptrue_b32(), v449, v457); + svfloat32_t v459 = svsub_f32_x(svptrue_b32(), v449, v457); + svfloat32_t v528 = svadd_f32_x(svptrue_b32(), v519, v527); + svfloat32_t v529 = svsub_f32_x(svptrue_b32(), v519, v527); + svfloat32_t v546 = svadd_f32_x(svptrue_b32(), v537, v545); + svfloat32_t v547 = svsub_f32_x(svptrue_b32(), v537, v545); + svfloat32_t v575 = svadd_f32_x(svptrue_b32(), v566, v574); + svfloat32_t v576 = svsub_f32_x(svptrue_b32(), v566, v574); + svfloat32_t v593 = svadd_f32_x(svptrue_b32(), v584, v592); + svfloat32_t v594 = svsub_f32_x(svptrue_b32(), v584, v592); + svfloat32_t zero60 = svdup_n_f32(0); + svfloat32_t v60 = svcmla_f32_x(pred_full, zero60, v1649, v53, 90); + svfloat32_t v61 = svadd_f32_x(svptrue_b32(), v34, v52); + svfloat32_t v62 = svsub_f32_x(svptrue_b32(), v34, v52); + svfloat32_t v101 = svadd_f32_x(svptrue_b32(), v81, v99); + svfloat32_t v102 = svsub_f32_x(svptrue_b32(), v81, v99); + svfloat32_t v118 = svmul_f32_x(svptrue_b32(), v82, v1646); + svfloat32_t v130 = svmul_f32_x(svptrue_b32(), v100, v1648); + svfloat32_t zero195 = svdup_n_f32(0); + svfloat32_t v195 = svcmla_f32_x(pred_full, zero195, v1649, v188, 90); + svfloat32_t v196 = svadd_f32_x(svptrue_b32(), v169, v187); + svfloat32_t v197 = svsub_f32_x(svptrue_b32(), v169, v187); + svfloat32_t zero242 = svdup_n_f32(0); + svfloat32_t v242 = svcmla_f32_x(pred_full, zero242, v1649, v235, 90); + svfloat32_t v243 = svadd_f32_x(svptrue_b32(), v216, v234); + svfloat32_t v244 = svsub_f32_x(svptrue_b32(), v216, v234); + svfloat32_t zero419 = svdup_n_f32(0); + svfloat32_t v419 = svcmla_f32_x(pred_full, zero419, v1649, v412, 90); + svfloat32_t v420 = svadd_f32_x(svptrue_b32(), v393, v411); + svfloat32_t v421 = svsub_f32_x(svptrue_b32(), v393, v411); + svfloat32_t v460 = svadd_f32_x(svptrue_b32(), v440, v458); + svfloat32_t v461 = svsub_f32_x(svptrue_b32(), v440, v458); + svfloat32_t v477 = svmul_f32_x(svptrue_b32(), v441, v1646); + svfloat32_t v489 = svmul_f32_x(svptrue_b32(), v459, v1648); + svfloat32_t zero554 = svdup_n_f32(0); + svfloat32_t v554 = svcmla_f32_x(pred_full, zero554, v1649, v547, 90); + svfloat32_t v555 = svadd_f32_x(svptrue_b32(), v528, v546); + svfloat32_t v556 = svsub_f32_x(svptrue_b32(), v528, v546); + svfloat32_t v595 = svadd_f32_x(svptrue_b32(), v575, v593); + svfloat32_t v596 = svsub_f32_x(svptrue_b32(), v575, v593); + svfloat32_t v612 = svmul_f32_x(svptrue_b32(), v576, v1646); + svfloat32_t v624 = svmul_f32_x(svptrue_b32(), v594, v1648); + svfloat32_t v63 = svsub_f32_x(svptrue_b32(), v35, v60); + svfloat32_t v64 = svadd_f32_x(svptrue_b32(), v35, v60); + svfloat32_t zero109 = svdup_n_f32(0); + svfloat32_t v109 = svcmla_f32_x(pred_full, zero109, v1649, v102, 90); + svfloat32_t v110 = svadd_f32_x(svptrue_b32(), v61, v101); + svfloat32_t v111 = svsub_f32_x(svptrue_b32(), v61, v101); + svfloat32_t v198 = svsub_f32_x(svptrue_b32(), v170, v195); + svfloat32_t v199 = svadd_f32_x(svptrue_b32(), v170, v195); + svfloat32_t v245 = svsub_f32_x(svptrue_b32(), v217, v242); + svfloat32_t v246 = svadd_f32_x(svptrue_b32(), v217, v242); + svfloat32_t v247 = svadd_f32_x(svptrue_b32(), v196, v243); + svfloat32_t v248 = 
svsub_f32_x(svptrue_b32(), v196, v243); + svfloat32_t v303 = svmul_f32_x(svptrue_b32(), v197, v1646); + svfloat32_t v315 = svmul_f32_x(svptrue_b32(), v244, v1648); + svfloat32_t v422 = svsub_f32_x(svptrue_b32(), v394, v419); + svfloat32_t v423 = svadd_f32_x(svptrue_b32(), v394, v419); + svfloat32_t zero468 = svdup_n_f32(0); + svfloat32_t v468 = svcmla_f32_x(pred_full, zero468, v1649, v461, 90); + svfloat32_t v469 = svadd_f32_x(svptrue_b32(), v420, v460); + svfloat32_t v470 = svsub_f32_x(svptrue_b32(), v420, v460); + svfloat32_t v557 = svsub_f32_x(svptrue_b32(), v529, v554); + svfloat32_t v558 = svadd_f32_x(svptrue_b32(), v529, v554); + svfloat32_t zero603 = svdup_n_f32(0); + svfloat32_t v603 = svcmla_f32_x(pred_full, zero603, v1649, v596, 90); + svfloat32_t v604 = svadd_f32_x(svptrue_b32(), v555, v595); + svfloat32_t v605 = svsub_f32_x(svptrue_b32(), v555, v595); + svfloat32_t v112 = svsub_f32_x(svptrue_b32(), v62, v109); + svfloat32_t v113 = svadd_f32_x(svptrue_b32(), v62, v109); + svfloat32_t v138 = svcmla_f32_x(pred_full, v118, v1773, v118, 90); + svfloat32_t v139 = svcmla_f32_x(pred_full, v130, v1649, v130, 90); + svfloat32_t zero255 = svdup_n_f32(0); + svfloat32_t v255 = svcmla_f32_x(pred_full, zero255, v1649, v248, 90); + svfloat32_t v256 = svadd_f32_x(svptrue_b32(), v110, v247); + svfloat32_t v257 = svsub_f32_x(svptrue_b32(), v110, v247); + svfloat32_t v264 = svmul_f32_x(svptrue_b32(), v198, v1564); + svfloat32_t v276 = svmul_f32_x(svptrue_b32(), v245, v1728); + svfloat32_t v342 = svmul_f32_x(svptrue_b32(), v199, v1728); + svfloat32_t v354 = svmul_f32_x(svptrue_b32(), v246, v1730); + svfloat32_t v471 = svsub_f32_x(svptrue_b32(), v421, v468); + svfloat32_t v472 = svadd_f32_x(svptrue_b32(), v421, v468); + svfloat32_t v497 = svcmla_f32_x(pred_full, v477, v1773, v477, 90); + svfloat32_t v498 = svcmla_f32_x(pred_full, v489, v1649, v489, 90); + svfloat32_t v606 = svsub_f32_x(svptrue_b32(), v556, v603); + svfloat32_t v607 = svadd_f32_x(svptrue_b32(), v556, v603); + svfloat32_t v632 = svcmla_f32_x(pred_full, v612, v1773, v612, 90); + svfloat32_t v633 = svcmla_f32_x(pred_full, v624, v1649, v624, 90); + svfloat32_t v647 = svadd_f32_x(svptrue_b32(), v469, v604); + svfloat32_t v648 = svsub_f32_x(svptrue_b32(), v469, v604); + svfloat32_t v893 = svmul_f32_x(svptrue_b32(), v470, v1646); + svfloat32_t v905 = svmul_f32_x(svptrue_b32(), v605, v1648); + svfloat32_t v140 = svadd_f32_x(svptrue_b32(), v138, v139); + svfloat32_t v141 = svsub_f32_x(svptrue_b32(), v139, v138); + svfloat32_t v258 = svsub_f32_x(svptrue_b32(), v111, v255); + svfloat32_t v259 = svadd_f32_x(svptrue_b32(), v111, v255); + svfloat32_t v284 = svcmla_f32_x(pred_full, v264, v1565, v198, 90); + svfloat32_t v285 = svcmla_f32_x(pred_full, v276, v1729, v245, 90); + svfloat32_t v323 = svcmla_f32_x(pred_full, v303, v1773, v303, 90); + svfloat32_t v324 = svcmla_f32_x(pred_full, v315, v1649, v315, 90); + svfloat32_t v362 = svcmla_f32_x(pred_full, v342, v1729, v199, 90); + svfloat32_t v363 = svcmla_f32_x(pred_full, v354, v1731, v246, 90); + svfloat32_t v499 = svadd_f32_x(svptrue_b32(), v497, v498); + svfloat32_t v500 = svsub_f32_x(svptrue_b32(), v498, v497); + svfloat32_t v634 = svadd_f32_x(svptrue_b32(), v632, v633); + svfloat32_t v635 = svsub_f32_x(svptrue_b32(), v633, v632); + svfloat32_t zero655 = svdup_n_f32(0); + svfloat32_t v655 = svcmla_f32_x(pred_full, zero655, v1649, v648, 90); + svfloat32_t v656 = svadd_f32_x(svptrue_b32(), v256, v647); + svfloat32_t v657 = svsub_f32_x(svptrue_b32(), v256, v647); + svfloat32_t v759 = 
svmul_f32_x(svptrue_b32(), v471, v1564); + svfloat32_t v771 = svmul_f32_x(svptrue_b32(), v606, v1728); + svfloat32_t v1027 = svmul_f32_x(svptrue_b32(), v472, v1728); + svfloat32_t v1039 = svmul_f32_x(svptrue_b32(), v607, v1730); + svfloat32_t zero148 = svdup_n_f32(0); + svfloat32_t v148 = svcmla_f32_x(pred_full, zero148, v1773, v141, 90); + svfloat32_t v149 = svadd_f32_x(svptrue_b32(), v63, v140); + svfloat32_t v150 = svsub_f32_x(svptrue_b32(), v63, v140); + svfloat32_t v286 = svadd_f32_x(svptrue_b32(), v284, v285); + svfloat32_t v287 = svsub_f32_x(svptrue_b32(), v285, v284); + svfloat32_t v325 = svadd_f32_x(svptrue_b32(), v323, v324); + svfloat32_t v326 = svsub_f32_x(svptrue_b32(), v324, v323); + svfloat32_t v364 = svadd_f32_x(svptrue_b32(), v362, v363); + svfloat32_t v365 = svsub_f32_x(svptrue_b32(), v363, v362); + svfloat32_t zero507 = svdup_n_f32(0); + svfloat32_t v507 = svcmla_f32_x(pred_full, zero507, v1773, v500, 90); + svfloat32_t v508 = svadd_f32_x(svptrue_b32(), v422, v499); + svfloat32_t v509 = svsub_f32_x(svptrue_b32(), v422, v499); + svfloat32_t zero642 = svdup_n_f32(0); + svfloat32_t v642 = svcmla_f32_x(pred_full, zero642, v1773, v635, 90); + svfloat32_t v643 = svadd_f32_x(svptrue_b32(), v557, v634); + svfloat32_t v644 = svsub_f32_x(svptrue_b32(), v557, v634); + svfloat32_t v658 = svsub_f32_x(svptrue_b32(), v257, v655); + svfloat32_t v659 = svadd_f32_x(svptrue_b32(), v257, v655); + svfloat32_t v779 = svcmla_f32_x(pred_full, v759, v1565, v471, 90); + svfloat32_t v780 = svcmla_f32_x(pred_full, v771, v1729, v606, 90); + svfloat32_t v913 = svcmla_f32_x(pred_full, v893, v1773, v893, 90); + svfloat32_t v914 = svcmla_f32_x(pred_full, v905, v1649, v905, 90); + svfloat32_t v1047 = svcmla_f32_x(pred_full, v1027, v1729, v472, 90); + svfloat32_t v1048 = svcmla_f32_x(pred_full, v1039, v1731, v607, 90); + svst1_f64(pred_full, (double *)(v1493), svreinterpret_f64_f32(v656)); + svst1_f64(pred_full, (double *)(v1511), svreinterpret_f64_f32(v657)); + svfloat32_t v151 = svsub_f32_x(svptrue_b32(), v64, v148); + svfloat32_t v152 = svadd_f32_x(svptrue_b32(), v64, v148); + svfloat32_t zero294 = svdup_n_f32(0); + svfloat32_t v294 = svcmla_f32_x(pred_full, zero294, v1773, v287, 90); + svfloat32_t v295 = svadd_f32_x(svptrue_b32(), v149, v286); + svfloat32_t v296 = svsub_f32_x(svptrue_b32(), v149, v286); + svfloat32_t zero333 = svdup_n_f32(0); + svfloat32_t v333 = svcmla_f32_x(pred_full, zero333, v1773, v326, 90); + svfloat32_t v334 = svadd_f32_x(svptrue_b32(), v112, v325); + svfloat32_t v335 = svsub_f32_x(svptrue_b32(), v112, v325); + svfloat32_t zero372 = svdup_n_f32(0); + svfloat32_t v372 = svcmla_f32_x(pred_full, zero372, v1773, v365, 90); + svfloat32_t v510 = svsub_f32_x(svptrue_b32(), v423, v507); + svfloat32_t v511 = svadd_f32_x(svptrue_b32(), v423, v507); + svfloat32_t v645 = svsub_f32_x(svptrue_b32(), v558, v642); + svfloat32_t v646 = svadd_f32_x(svptrue_b32(), v558, v642); + svfloat32_t v692 = svmul_f32_x(svptrue_b32(), v508, v1523); + svfloat32_t v704 = svmul_f32_x(svptrue_b32(), v643, v1605); + svfloat32_t v781 = svadd_f32_x(svptrue_b32(), v779, v780); + svfloat32_t v782 = svsub_f32_x(svptrue_b32(), v780, v779); + svfloat32_t v915 = svadd_f32_x(svptrue_b32(), v913, v914); + svfloat32_t v916 = svsub_f32_x(svptrue_b32(), v914, v913); + svfloat32_t v960 = svmul_f32_x(svptrue_b32(), v509, v1687); + svfloat32_t v972 = svmul_f32_x(svptrue_b32(), v644, v1689); + svfloat32_t v1049 = svadd_f32_x(svptrue_b32(), v1047, v1048); + svfloat32_t v1050 = svsub_f32_x(svptrue_b32(), v1048, v1047); + 
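/* Each svst1_f64 writes one complex float32 result per 64-bit lane: the same output bin for svcntd() consecutive transforms of the batch, since this variant advances the output pointer by svcntd() complex elements per outer iteration (v12 = v10 * 1). */ +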
svst1_f64(pred_full, (double *)(v1502), svreinterpret_f64_f32(v658)); + svst1_f64(pred_full, (double *)(v1520), svreinterpret_f64_f32(v659)); + svfloat32_t v297 = svsub_f32_x(svptrue_b32(), v150, v294); + svfloat32_t v298 = svadd_f32_x(svptrue_b32(), v150, v294); + svfloat32_t v336 = svsub_f32_x(svptrue_b32(), v113, v333); + svfloat32_t v337 = svadd_f32_x(svptrue_b32(), v113, v333); + svfloat32_t v373 = svadd_f32_x(svptrue_b32(), v151, v364); + svfloat32_t v374 = svsub_f32_x(svptrue_b32(), v151, v364); + svfloat32_t v375 = svsub_f32_x(svptrue_b32(), v152, v372); + svfloat32_t v376 = svadd_f32_x(svptrue_b32(), v152, v372); + svfloat32_t v712 = svcmla_f32_x(pred_full, v692, v1690, v508, 90); + svfloat32_t v713 = svcmla_f32_x(pred_full, v704, v1606, v643, 90); + svfloat32_t zero789 = svdup_n_f32(0); + svfloat32_t v789 = svcmla_f32_x(pred_full, zero789, v1773, v782, 90); + svfloat32_t v790 = svadd_f32_x(svptrue_b32(), v334, v781); + svfloat32_t v791 = svsub_f32_x(svptrue_b32(), v334, v781); + svfloat32_t v826 = svmul_f32_x(svptrue_b32(), v510, v1605); + svfloat32_t v838 = svmul_f32_x(svptrue_b32(), v645, v1607); + svfloat32_t zero923 = svdup_n_f32(0); + svfloat32_t v923 = svcmla_f32_x(pred_full, zero923, v1773, v916, 90); + svfloat32_t v924 = svadd_f32_x(svptrue_b32(), v258, v915); + svfloat32_t v925 = svsub_f32_x(svptrue_b32(), v258, v915); + svfloat32_t v980 = svcmla_f32_x(pred_full, v960, v1688, v509, 90); + svfloat32_t v981 = svcmla_f32_x(pred_full, v972, v1690, v644, 90); + svfloat32_t zero1057 = svdup_n_f32(0); + svfloat32_t v1057 = svcmla_f32_x(pred_full, zero1057, v1773, v1050, 90); + svfloat32_t v1094 = svmul_f32_x(svptrue_b32(), v511, v1769); + svfloat32_t v1106 = svmul_f32_x(svptrue_b32(), v646, v1771); + svfloat32_t v714 = svadd_f32_x(svptrue_b32(), v712, v713); + svfloat32_t v715 = svsub_f32_x(svptrue_b32(), v713, v712); + svfloat32_t v792 = svsub_f32_x(svptrue_b32(), v335, v789); + svfloat32_t v793 = svadd_f32_x(svptrue_b32(), v335, v789); + svfloat32_t v846 = svcmla_f32_x(pred_full, v826, v1606, v510, 90); + svfloat32_t v847 = svcmla_f32_x(pred_full, v838, v1770, v645, 90); + svfloat32_t v926 = svsub_f32_x(svptrue_b32(), v259, v923); + svfloat32_t v927 = svadd_f32_x(svptrue_b32(), v259, v923); + svfloat32_t v982 = svadd_f32_x(svptrue_b32(), v980, v981); + svfloat32_t v983 = svsub_f32_x(svptrue_b32(), v981, v980); + svfloat32_t v1058 = svadd_f32_x(svptrue_b32(), v336, v1049); + svfloat32_t v1059 = svsub_f32_x(svptrue_b32(), v336, v1049); + svfloat32_t v1060 = svsub_f32_x(svptrue_b32(), v337, v1057); + svfloat32_t v1061 = svadd_f32_x(svptrue_b32(), v337, v1057); + svfloat32_t v1114 = svcmla_f32_x(pred_full, v1094, v1770, v511, 90); + svfloat32_t v1115 = svcmla_f32_x(pred_full, v1106, v1772, v646, 90); + svst1_f64(pred_full, (double *)(v1575), svreinterpret_f64_f32(v790)); + svst1_f64(pred_full, (double *)(v1593), svreinterpret_f64_f32(v791)); + svst1_f64(pred_full, (double *)(v1657), svreinterpret_f64_f32(v924)); + svst1_f64(pred_full, (double *)(v1675), svreinterpret_f64_f32(v925)); + svfloat32_t zero722 = svdup_n_f32(0); + svfloat32_t v722 = svcmla_f32_x(pred_full, zero722, v1773, v715, 90); + svfloat32_t v723 = svadd_f32_x(svptrue_b32(), v295, v714); + svfloat32_t v724 = svsub_f32_x(svptrue_b32(), v295, v714); + svfloat32_t v848 = svadd_f32_x(svptrue_b32(), v846, v847); + svfloat32_t v849 = svsub_f32_x(svptrue_b32(), v847, v846); + svfloat32_t zero990 = svdup_n_f32(0); + svfloat32_t v990 = svcmla_f32_x(pred_full, zero990, v1773, v983, 90); + svfloat32_t v991 = 
svadd_f32_x(svptrue_b32(), v297, v982); + svfloat32_t v992 = svsub_f32_x(svptrue_b32(), v297, v982); + svfloat32_t v1116 = svadd_f32_x(svptrue_b32(), v1114, v1115); + svfloat32_t v1117 = svsub_f32_x(svptrue_b32(), v1115, v1114); + svst1_f64(pred_full, (double *)(v1584), svreinterpret_f64_f32(v792)); + svst1_f64(pred_full, (double *)(v1602), svreinterpret_f64_f32(v793)); + svst1_f64(pred_full, (double *)(v1666), svreinterpret_f64_f32(v926)); + svst1_f64(pred_full, (double *)(v1684), svreinterpret_f64_f32(v927)); + svst1_f64(pred_full, (double *)(v1739), svreinterpret_f64_f32(v1058)); + svst1_f64(pred_full, (double *)(v1748), svreinterpret_f64_f32(v1060)); + svst1_f64(pred_full, (double *)(v1757), svreinterpret_f64_f32(v1059)); + svst1_f64(pred_full, (double *)(v1766), svreinterpret_f64_f32(v1061)); + svfloat32_t v725 = svsub_f32_x(svptrue_b32(), v296, v722); + svfloat32_t v726 = svadd_f32_x(svptrue_b32(), v296, v722); + svfloat32_t zero856 = svdup_n_f32(0); + svfloat32_t v856 = svcmla_f32_x(pred_full, zero856, v1773, v849, 90); + svfloat32_t v857 = svadd_f32_x(svptrue_b32(), v373, v848); + svfloat32_t v858 = svsub_f32_x(svptrue_b32(), v373, v848); + svfloat32_t v993 = svsub_f32_x(svptrue_b32(), v298, v990); + svfloat32_t v994 = svadd_f32_x(svptrue_b32(), v298, v990); + svfloat32_t zero1124 = svdup_n_f32(0); + svfloat32_t v1124 = svcmla_f32_x(pred_full, zero1124, v1773, v1117, 90); + svfloat32_t v1125 = svadd_f32_x(svptrue_b32(), v375, v1116); + svfloat32_t v1126 = svsub_f32_x(svptrue_b32(), v375, v1116); + svst1_f64(pred_full, (double *)(v1534), svreinterpret_f64_f32(v723)); + svst1_f64(pred_full, (double *)(v1552), svreinterpret_f64_f32(v724)); + svst1_f64(pred_full, (double *)(v1698), svreinterpret_f64_f32(v991)); + svst1_f64(pred_full, (double *)(v1716), svreinterpret_f64_f32(v992)); + svfloat32_t v859 = svsub_f32_x(svptrue_b32(), v374, v856); + svfloat32_t v860 = svadd_f32_x(svptrue_b32(), v374, v856); + svfloat32_t v1127 = svsub_f32_x(svptrue_b32(), v376, v1124); + svfloat32_t v1128 = svadd_f32_x(svptrue_b32(), v376, v1124); + svst1_f64(pred_full, (double *)(v1543), svreinterpret_f64_f32(v725)); + svst1_f64(pred_full, (double *)(v1561), svreinterpret_f64_f32(v726)); + svst1_f64(pred_full, (double *)(v1616), svreinterpret_f64_f32(v857)); + svst1_f64(pred_full, (double *)(v1634), svreinterpret_f64_f32(v858)); + svst1_f64(pred_full, (double *)(v1707), svreinterpret_f64_f32(v993)); + svst1_f64(pred_full, (double *)(v1725), svreinterpret_f64_f32(v994)); + svst1_f64(pred_full, (double *)(v1780), svreinterpret_f64_f32(v1125)); + svst1_f64(pred_full, (double *)(v1798), svreinterpret_f64_f32(v1126)); + svst1_f64(pred_full, (double *)(v1625), svreinterpret_f64_f32(v859)); + svst1_f64(pred_full, (double *)(v1643), svreinterpret_f64_f32(v860)); + svst1_f64(pred_full, (double *)(v1789), svreinterpret_f64_f32(v1127)); + svst1_f64(pred_full, (double *)(v1807), svreinterpret_f64_f32(v1128)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cf32_ac_n_gu36(const armral_cmplx_int16_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + int idist, float dir) { + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + for (int j = 0; j < howmany; j += 1) { + int16x4_t v244 = vld1s_s16(&v5[istride]); + float v533 = 8.6602540378443871e-01F; + float v552 = 6.4278760968653925e-01F; + float v559 = -3.4202014332566888e-01F; + float v566 = 9.8480775301220802e-01F; + float v618 = 
1.0000000000000000e+00F; + float v619 = -1.0000000000000000e+00F; + float v625 = -5.0000000000000000e-01F; + float v626 = 5.0000000000000000e-01F; + float v636 = -1.4999999999999998e+00F; + float v637 = 1.4999999999999998e+00F; + float v644 = -8.6602540378443871e-01F; + float v647 = 7.6604444311897801e-01F; + float v648 = -7.6604444311897801e-01F; + float v654 = 9.3969262078590832e-01F; + float v655 = -9.3969262078590832e-01F; + float v661 = -1.7364817766693039e-01F; + float v662 = 1.7364817766693039e-01F; + float32x2_t v664 = (float32x2_t){v4, v4}; + float v669 = -6.4278760968653925e-01F; + float v673 = 3.4202014332566888e-01F; + float v677 = -9.8480775301220802e-01F; + int16x4_t v20 = vld1s_s16(&v5[0]); + float32x2_t v245 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v244)), 15); + float32x2_t v520 = (float32x2_t){v625, v625}; + float32x2_t v531 = (float32x2_t){v636, v636}; + float32x2_t v535 = (float32x2_t){v533, v644}; + float32x2_t v542 = (float32x2_t){v647, v647}; + float32x2_t v546 = (float32x2_t){v654, v654}; + float32x2_t v550 = (float32x2_t){v661, v661}; + float32x2_t v554 = (float32x2_t){v552, v669}; + float32x2_t v561 = (float32x2_t){v559, v673}; + float32x2_t v568 = (float32x2_t){v566, v677}; + float32x2_t v620 = (float32x2_t){v618, v619}; + float32x2_t v627 = (float32x2_t){v625, v626}; + float32x2_t v638 = (float32x2_t){v636, v637}; + float32x2_t v645 = (float32x2_t){v644, v644}; + float32x2_t v649 = (float32x2_t){v647, v648}; + float32x2_t v656 = (float32x2_t){v654, v655}; + float32x2_t v663 = (float32x2_t){v661, v662}; + float32x2_t v670 = (float32x2_t){v669, v669}; + float32x2_t v674 = (float32x2_t){v673, v673}; + float32x2_t v678 = (float32x2_t){v677, v677}; + float32x2_t v21 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v20)), 15); + int16x4_t v26 = vld1s_s16(&v5[istride * 18]); + int16x4_t v34 = vld1s_s16(&v5[istride * 9]); + int16x4_t v40 = vld1s_s16(&v5[istride * 27]); + int16x4_t v50 = vld1s_s16(&v5[istride * 4]); + int16x4_t v56 = vld1s_s16(&v5[istride * 22]); + int16x4_t v64 = vld1s_s16(&v5[istride * 13]); + int16x4_t v70 = vld1s_s16(&v5[istride * 31]); + int16x4_t v80 = vld1s_s16(&v5[istride * 8]); + int16x4_t v86 = vld1s_s16(&v5[istride * 26]); + int16x4_t v94 = vld1s_s16(&v5[istride * 17]); + int16x4_t v100 = vld1s_s16(&v5[istride * 35]); + int16x4_t v110 = vld1s_s16(&v5[istride * 12]); + int16x4_t v116 = vld1s_s16(&v5[istride * 30]); + int16x4_t v124 = vld1s_s16(&v5[istride * 21]); + int16x4_t v130 = vld1s_s16(&v5[istride * 3]); + int16x4_t v140 = vld1s_s16(&v5[istride * 16]); + int16x4_t v146 = vld1s_s16(&v5[istride * 34]); + int16x4_t v154 = vld1s_s16(&v5[istride * 25]); + int16x4_t v160 = vld1s_s16(&v5[istride * 7]); + int16x4_t v170 = vld1s_s16(&v5[istride * 20]); + int16x4_t v176 = vld1s_s16(&v5[istride * 2]); + int16x4_t v184 = vld1s_s16(&v5[istride * 29]); + int16x4_t v190 = vld1s_s16(&v5[istride * 11]); + int16x4_t v200 = vld1s_s16(&v5[istride * 24]); + int16x4_t v206 = vld1s_s16(&v5[istride * 6]); + int16x4_t v214 = vld1s_s16(&v5[istride * 33]); + int16x4_t v220 = vld1s_s16(&v5[istride * 15]); + int16x4_t v230 = vld1s_s16(&v5[istride * 28]); + int16x4_t v236 = vld1s_s16(&v5[istride * 10]); + int16x4_t v250 = vld1s_s16(&v5[istride * 19]); + int16x4_t v260 = vld1s_s16(&v5[istride * 32]); + int16x4_t v266 = vld1s_s16(&v5[istride * 14]); + int16x4_t v274 = vld1s_s16(&v5[istride * 5]); + int16x4_t v280 = vld1s_s16(&v5[istride * 23]); + float32x2_t v537 = vmul_f32(v664, v535); + float32x2_t v556 = vmul_f32(v664, v554); + float32x2_t v563 = vmul_f32(v664, v561); + 
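/* v664 = {dir, dir}: multiplying the sine-valued twiddle constants by the transform direction (+1 or -1) conjugates the twiddles, so the same kernel serves both forward and inverse transforms. */ +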
float32x2_t v570 = vmul_f32(v664, v568); + float32x2_t v622 = vmul_f32(v664, v620); + float32x2_t v629 = vmul_f32(v664, v627); + float32x2_t v640 = vmul_f32(v664, v638); + float32x2_t v651 = vmul_f32(v664, v649); + float32x2_t v658 = vmul_f32(v664, v656); + float32x2_t v665 = vmul_f32(v664, v663); + float32x2_t v27 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v26)), 15); + float32x2_t v35 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v34)), 15); + float32x2_t v41 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v40)), 15); + float32x2_t v51 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v50)), 15); + float32x2_t v57 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v56)), 15); + float32x2_t v65 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v64)), 15); + float32x2_t v71 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v70)), 15); + float32x2_t v81 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v80)), 15); + float32x2_t v87 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v86)), 15); + float32x2_t v95 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v94)), 15); + float32x2_t v101 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v100)), 15); + float32x2_t v111 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v110)), 15); + float32x2_t v117 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v116)), 15); + float32x2_t v125 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v124)), 15); + float32x2_t v131 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v130)), 15); + float32x2_t v141 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v140)), 15); + float32x2_t v147 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v146)), 15); + float32x2_t v155 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v154)), 15); + float32x2_t v161 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v160)), 15); + float32x2_t v171 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v170)), 15); + float32x2_t v177 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v176)), 15); + float32x2_t v185 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v184)), 15); + float32x2_t v191 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v190)), 15); + float32x2_t v201 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v200)), 15); + float32x2_t v207 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v206)), 15); + float32x2_t v215 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v214)), 15); + float32x2_t v221 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v220)), 15); + float32x2_t v231 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v230)), 15); + float32x2_t v237 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v236)), 15); + float32x2_t v251 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v250)), 15); + float32x2_t v261 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v260)), 15); + float32x2_t v267 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v266)), 15); + float32x2_t v275 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v274)), 15); + float32x2_t v281 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v280)), 15); + float32x2_t v28 = vadd_f32(v21, v27); + float32x2_t v29 = vsub_f32(v21, v27); + float32x2_t v42 = vadd_f32(v35, v41); + float32x2_t v43 = vsub_f32(v35, v41); + float32x2_t v58 = vadd_f32(v51, v57); + float32x2_t v59 = vsub_f32(v51, v57); + float32x2_t v72 = vadd_f32(v65, v71); + float32x2_t v73 = vsub_f32(v65, v71); + float32x2_t v88 = vadd_f32(v81, v87); + float32x2_t v89 = vsub_f32(v81, v87); + float32x2_t v102 = vadd_f32(v95, v101); + float32x2_t v103 = vsub_f32(v95, v101); + float32x2_t v118 = vadd_f32(v111, v117); + float32x2_t v119 = vsub_f32(v111, v117); + float32x2_t v132 = vadd_f32(v125, v131); + float32x2_t v133 = vsub_f32(v125, v131); + float32x2_t v148 = vadd_f32(v141, v147); + float32x2_t v149 = vsub_f32(v141, v147); + float32x2_t v162 = vadd_f32(v155, v161); + float32x2_t v163 = vsub_f32(v155, v161); 
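+ /* The vadd/vsub pairs above and below are the first-stage radix-2 butterflies of the 36-point kernel: each combines two inputs whose indices differ by 18. */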
+ float32x2_t v178 = vadd_f32(v171, v177); + float32x2_t v179 = vsub_f32(v171, v177); + float32x2_t v192 = vadd_f32(v185, v191); + float32x2_t v193 = vsub_f32(v185, v191); + float32x2_t v208 = vadd_f32(v201, v207); + float32x2_t v209 = vsub_f32(v201, v207); + float32x2_t v222 = vadd_f32(v215, v221); + float32x2_t v223 = vsub_f32(v215, v221); + float32x2_t v238 = vadd_f32(v231, v237); + float32x2_t v239 = vsub_f32(v231, v237); + float32x2_t v252 = vadd_f32(v245, v251); + float32x2_t v253 = vsub_f32(v245, v251); + float32x2_t v268 = vadd_f32(v261, v267); + float32x2_t v269 = vsub_f32(v261, v267); + float32x2_t v282 = vadd_f32(v275, v281); + float32x2_t v283 = vsub_f32(v275, v281); + float32x2_t v44 = vadd_f32(v28, v42); + float32x2_t v45 = vsub_f32(v28, v42); + float32x2_t v74 = vadd_f32(v58, v72); + float32x2_t v75 = vsub_f32(v58, v72); + float32x2_t v104 = vadd_f32(v88, v102); + float32x2_t v105 = vsub_f32(v88, v102); + float32x2_t v134 = vadd_f32(v118, v132); + float32x2_t v135 = vsub_f32(v118, v132); + float32x2_t v164 = vadd_f32(v148, v162); + float32x2_t v165 = vsub_f32(v148, v162); + float32x2_t v194 = vadd_f32(v178, v192); + float32x2_t v195 = vsub_f32(v178, v192); + float32x2_t v224 = vadd_f32(v208, v222); + float32x2_t v225 = vsub_f32(v208, v222); + float32x2_t v254 = vadd_f32(v238, v252); + float32x2_t v255 = vsub_f32(v238, v252); + float32x2_t v284 = vadd_f32(v268, v282); + float32x2_t v285 = vsub_f32(v268, v282); + float32x2_t v494 = vadd_f32(v59, v269); + float32x2_t v495 = vsub_f32(v59, v269); + float32x2_t v496 = vadd_f32(v239, v89); + float32x2_t v497 = vsub_f32(v239, v89); + float32x2_t v498 = vadd_f32(v119, v209); + float32x2_t v499 = vsub_f32(v119, v209); + float32x2_t v500 = vadd_f32(v149, v179); + float32x2_t v501 = vsub_f32(v149, v179); + float32x2_t v598 = vadd_f32(v73, v283); + float32x2_t v599 = vsub_f32(v73, v283); + float32x2_t v600 = vadd_f32(v253, v103); + float32x2_t v601 = vsub_f32(v253, v103); + float32x2_t v602 = vadd_f32(v133, v223); + float32x2_t v603 = vsub_f32(v133, v223); + float32x2_t v604 = vadd_f32(v163, v193); + float32x2_t v605 = vsub_f32(v163, v193); + float32x2_t v286 = vadd_f32(v74, v284); + float32x2_t v287 = vsub_f32(v74, v284); + float32x2_t v288 = vadd_f32(v254, v104); + float32x2_t v289 = vsub_f32(v254, v104); + float32x2_t v290 = vadd_f32(v134, v224); + float32x2_t v291 = vsub_f32(v134, v224); + float32x2_t v292 = vadd_f32(v164, v194); + float32x2_t v293 = vsub_f32(v164, v194); + float32x2_t v390 = vadd_f32(v75, v285); + float32x2_t v391 = vsub_f32(v75, v285); + float32x2_t v392 = vadd_f32(v255, v105); + float32x2_t v393 = vsub_f32(v255, v105); + float32x2_t v394 = vadd_f32(v135, v225); + float32x2_t v395 = vsub_f32(v135, v225); + float32x2_t v396 = vadd_f32(v165, v195); + float32x2_t v397 = vsub_f32(v165, v195); + float32x2_t v502 = vadd_f32(v494, v496); + float32x2_t v506 = vadd_f32(v495, v497); + float32x2_t v508 = vsub_f32(v494, v496); + float32x2_t v509 = vsub_f32(v496, v500); + float32x2_t v510 = vsub_f32(v500, v494); + float32x2_t v511 = vsub_f32(v495, v497); + float32x2_t v512 = vsub_f32(v497, v501); + float32x2_t v513 = vsub_f32(v501, v495); + float32x2_t v532 = vmul_f32(v498, v531); + float32x2_t v538 = vrev64_f32(v499); + float32x2_t v606 = vadd_f32(v598, v600); + float32x2_t v610 = vadd_f32(v599, v601); + float32x2_t v612 = vsub_f32(v598, v600); + float32x2_t v613 = vsub_f32(v600, v604); + float32x2_t v614 = vsub_f32(v604, v598); + float32x2_t v615 = vsub_f32(v599, v601); + float32x2_t v616 = vsub_f32(v601, v605); + float32x2_t 
v617 = vsub_f32(v605, v599); + float32x2_t v641 = vrev64_f32(v602); + float32x2_t v646 = vmul_f32(v603, v645); + float32x2_t v294 = vadd_f32(v286, v288); + float32x2_t v298 = vadd_f32(v287, v289); + float32x2_t v300 = vsub_f32(v286, v288); + float32x2_t v301 = vsub_f32(v288, v292); + float32x2_t v302 = vsub_f32(v292, v286); + float32x2_t v303 = vsub_f32(v287, v289); + float32x2_t v304 = vsub_f32(v289, v293); + float32x2_t v305 = vsub_f32(v293, v287); + float32x2_t v324 = vmul_f32(v290, v531); + float32x2_t v330 = vrev64_f32(v291); + float32x2_t v398 = vadd_f32(v390, v392); + float32x2_t v402 = vadd_f32(v391, v393); + float32x2_t v404 = vsub_f32(v390, v392); + float32x2_t v405 = vsub_f32(v392, v396); + float32x2_t v406 = vsub_f32(v396, v390); + float32x2_t v407 = vsub_f32(v391, v393); + float32x2_t v408 = vsub_f32(v393, v397); + float32x2_t v409 = vsub_f32(v397, v391); + float32x2_t v428 = vmul_f32(v394, v531); + float32x2_t v434 = vrev64_f32(v395); + float32x2_t v503 = vadd_f32(v502, v500); + float32x2_t v507 = vadd_f32(v506, v501); + float32x2_t v539 = vmul_f32(v538, v537); + float32x2_t v543 = vmul_f32(v508, v542); + float32x2_t v547 = vmul_f32(v509, v546); + float32x2_t v551 = vmul_f32(v510, v550); + float32x2_t v557 = vrev64_f32(v511); + float32x2_t v564 = vrev64_f32(v512); + float32x2_t v571 = vrev64_f32(v513); + float32x2_t v607 = vadd_f32(v606, v604); + float32x2_t v611 = vadd_f32(v610, v605); + float32x2_t v642 = vmul_f32(v641, v640); + float32x2_t v652 = vrev64_f32(v612); + float32x2_t v659 = vrev64_f32(v613); + float32x2_t v666 = vrev64_f32(v614); + float32x2_t v671 = vmul_f32(v615, v670); + float32x2_t v675 = vmul_f32(v616, v674); + float32x2_t v679 = vmul_f32(v617, v678); + float32x2_t v295 = vadd_f32(v294, v292); + float32x2_t v299 = vadd_f32(v298, v293); + float32x2_t v331 = vmul_f32(v330, v537); + float32x2_t v335 = vmul_f32(v300, v542); + float32x2_t v339 = vmul_f32(v301, v546); + float32x2_t v343 = vmul_f32(v302, v550); + float32x2_t v349 = vrev64_f32(v303); + float32x2_t v356 = vrev64_f32(v304); + float32x2_t v363 = vrev64_f32(v305); + float32x2_t v399 = vadd_f32(v398, v396); + float32x2_t v403 = vadd_f32(v402, v397); + float32x2_t v435 = vmul_f32(v434, v537); + float32x2_t v439 = vmul_f32(v404, v542); + float32x2_t v443 = vmul_f32(v405, v546); + float32x2_t v447 = vmul_f32(v406, v550); + float32x2_t v453 = vrev64_f32(v407); + float32x2_t v460 = vrev64_f32(v408); + float32x2_t v467 = vrev64_f32(v409); + float32x2_t v504 = vadd_f32(v503, v498); + float32x2_t v521 = vmul_f32(v503, v520); + float32x2_t v527 = vrev64_f32(v507); + float32x2_t v558 = vmul_f32(v557, v556); + float32x2_t v565 = vmul_f32(v564, v563); + float32x2_t v572 = vmul_f32(v571, v570); + float32x2_t v608 = vadd_f32(v607, v602); + float32x2_t v630 = vrev64_f32(v607); + float32x2_t v635 = vmul_f32(v611, v645); + float32x2_t v653 = vmul_f32(v652, v651); + float32x2_t v660 = vmul_f32(v659, v658); + float32x2_t v667 = vmul_f32(v666, v665); + float32x2_t v693 = vadd_f32(v646, v671); + float32x2_t v695 = vsub_f32(v646, v675); + float32x2_t v697 = vsub_f32(v646, v671); + float32x2_t v296 = vadd_f32(v295, v290); + float32x2_t v313 = vmul_f32(v295, v520); + float32x2_t v319 = vrev64_f32(v299); + float32x2_t v350 = vmul_f32(v349, v556); + float32x2_t v357 = vmul_f32(v356, v563); + float32x2_t v364 = vmul_f32(v363, v570); + float32x2_t v400 = vadd_f32(v399, v394); + float32x2_t v417 = vmul_f32(v399, v520); + float32x2_t v423 = vrev64_f32(v403); + float32x2_t v454 = vmul_f32(v453, v556); + float32x2_t v461 = 
vmul_f32(v460, v563); + float32x2_t v468 = vmul_f32(v467, v570); + float32x2_t v505 = vadd_f32(v504, v29); + float32x2_t v528 = vmul_f32(v527, v537); + float32x2_t v573 = vadd_f32(v521, v521); + float32x2_t v586 = vadd_f32(v539, v558); + float32x2_t v588 = vsub_f32(v539, v565); + float32x2_t v590 = vsub_f32(v539, v558); + float32x2_t v609 = vadd_f32(v608, v43); + float32x2_t v631 = vmul_f32(v630, v629); + float32x2_t v694 = vadd_f32(v693, v675); + float32x2_t v696 = vadd_f32(v695, v679); + float32x2_t v698 = vsub_f32(v697, v679); + float32x2_t v297 = vadd_f32(v296, v44); + float32x2_t v320 = vmul_f32(v319, v537); + float32x2_t v365 = vadd_f32(v313, v313); + float32x2_t v378 = vadd_f32(v331, v350); + float32x2_t v380 = vsub_f32(v331, v357); + float32x2_t v382 = vsub_f32(v331, v350); + float32x2_t v401 = vadd_f32(v400, v45); + float32x2_t v424 = vmul_f32(v423, v537); + float32x2_t v469 = vadd_f32(v417, v417); + float32x2_t v482 = vadd_f32(v435, v454); + float32x2_t v484 = vsub_f32(v435, v461); + float32x2_t v486 = vsub_f32(v435, v454); + float32x2_t v574 = vadd_f32(v573, v521); + float32x2_t v578 = vadd_f32(v505, v532); + float32x2_t v587 = vadd_f32(v586, v565); + float32x2_t v589 = vadd_f32(v588, v572); + float32x2_t v591 = vsub_f32(v590, v572); + float32x2_t v623 = vrev64_f32(v609); + float32x2_t v680 = vadd_f32(v631, v631); + float32x2_t v366 = vadd_f32(v365, v313); + float32x2_t v370 = vadd_f32(v297, v324); + float32x2_t v379 = vadd_f32(v378, v357); + float32x2_t v381 = vadd_f32(v380, v364); + float32x2_t v383 = vsub_f32(v382, v364); + float32x2_t v470 = vadd_f32(v469, v417); + float32x2_t v474 = vadd_f32(v401, v428); + float32x2_t v483 = vadd_f32(v482, v461); + float32x2_t v485 = vadd_f32(v484, v468); + float32x2_t v487 = vsub_f32(v486, v468); + float32x2_t v575 = vadd_f32(v505, v574); + float32x2_t v579 = vadd_f32(v578, v573); + float32x2_t v624 = vmul_f32(v623, v622); + float32x2_t v681 = vadd_f32(v680, v631); + v6[0] = v297; + v6[ostride * 18] = v401; + float32x2_t v367 = vadd_f32(v297, v366); + float32x2_t v371 = vadd_f32(v370, v365); + float32x2_t v471 = vadd_f32(v401, v470); + float32x2_t v475 = vadd_f32(v474, v469); + float32x2_t v576 = vadd_f32(v575, v528); + float32x2_t v577 = vsub_f32(v575, v528); + float32x2_t v580 = vadd_f32(v579, v543); + float32x2_t v582 = vsub_f32(v579, v547); + float32x2_t v584 = vsub_f32(v579, v543); + float32x2_t v682 = vadd_f32(v624, v681); + float32x2_t v685 = vadd_f32(v624, v642); + float32x2_t v705 = vadd_f32(v505, v624); + float32x2_t v706 = vsub_f32(v505, v624); + float32x2_t v368 = vadd_f32(v367, v320); + float32x2_t v369 = vsub_f32(v367, v320); + float32x2_t v372 = vadd_f32(v371, v335); + float32x2_t v374 = vsub_f32(v371, v339); + float32x2_t v376 = vsub_f32(v371, v335); + float32x2_t v472 = vadd_f32(v471, v424); + float32x2_t v473 = vsub_f32(v471, v424); + float32x2_t v476 = vadd_f32(v475, v439); + float32x2_t v478 = vsub_f32(v475, v443); + float32x2_t v480 = vsub_f32(v475, v439); + float32x2_t v581 = vadd_f32(v580, v547); + float32x2_t v583 = vadd_f32(v582, v551); + float32x2_t v585 = vsub_f32(v584, v551); + float32x2_t v683 = vadd_f32(v682, v635); + float32x2_t v684 = vsub_f32(v682, v635); + float32x2_t v686 = vadd_f32(v685, v680); + v6[ostride * 9] = v706; + v6[ostride * 27] = v705; + float32x2_t v373 = vadd_f32(v372, v339); + float32x2_t v375 = vadd_f32(v374, v343); + float32x2_t v377 = vsub_f32(v376, v343); + float32x2_t v477 = vadd_f32(v476, v443); + float32x2_t v479 = vadd_f32(v478, v447); + float32x2_t v481 = vsub_f32(v480, v447); + 
float32x2_t v592 = vadd_f32(v581, v587); + float32x2_t v593 = vsub_f32(v581, v587); + float32x2_t v594 = vadd_f32(v583, v589); + float32x2_t v595 = vsub_f32(v583, v589); + float32x2_t v596 = vadd_f32(v585, v591); + float32x2_t v597 = vsub_f32(v585, v591); + float32x2_t v687 = vadd_f32(v686, v653); + float32x2_t v689 = vsub_f32(v686, v660); + float32x2_t v691 = vsub_f32(v686, v653); + float32x2_t v771 = vadd_f32(v577, v684); + float32x2_t v772 = vsub_f32(v577, v684); + v6[ostride * 12] = v369; + v6[ostride * 30] = v473; + float32x2_t v837 = vadd_f32(v576, v683); + float32x2_t v838 = vsub_f32(v576, v683); + v6[ostride * 24] = v368; + v6[ostride * 6] = v472; + float32x2_t v384 = vadd_f32(v373, v379); + float32x2_t v385 = vsub_f32(v373, v379); + float32x2_t v386 = vadd_f32(v375, v381); + float32x2_t v387 = vsub_f32(v375, v381); + float32x2_t v388 = vadd_f32(v377, v383); + float32x2_t v389 = vsub_f32(v377, v383); + float32x2_t v488 = vadd_f32(v477, v483); + float32x2_t v489 = vsub_f32(v477, v483); + float32x2_t v490 = vadd_f32(v479, v485); + float32x2_t v491 = vsub_f32(v479, v485); + float32x2_t v492 = vadd_f32(v481, v487); + float32x2_t v493 = vsub_f32(v481, v487); + float32x2_t v688 = vadd_f32(v687, v660); + float32x2_t v690 = vadd_f32(v689, v667); + float32x2_t v692 = vsub_f32(v691, v667); + v6[ostride * 21] = v772; + v6[ostride * 3] = v771; + v6[ostride * 33] = v838; + v6[ostride * 15] = v837; + float32x2_t v699 = vadd_f32(v688, v694); + float32x2_t v700 = vsub_f32(v688, v694); + float32x2_t v701 = vadd_f32(v690, v696); + float32x2_t v702 = vsub_f32(v690, v696); + float32x2_t v703 = vadd_f32(v692, v698); + float32x2_t v704 = vsub_f32(v692, v698); + v6[ostride * 28] = v385; + v6[ostride * 10] = v489; + v6[ostride * 20] = v386; + v6[ostride * 2] = v490; + v6[ostride * 4] = v389; + v6[ostride * 22] = v493; + v6[ostride * 32] = v388; + v6[ostride * 14] = v492; + v6[ostride * 16] = v387; + v6[ostride * 34] = v491; + v6[ostride * 8] = v384; + v6[ostride * 26] = v488; + float32x2_t v727 = vadd_f32(v593, v700); + float32x2_t v728 = vsub_f32(v593, v700); + float32x2_t v749 = vadd_f32(v594, v701); + float32x2_t v750 = vsub_f32(v594, v701); + float32x2_t v793 = vadd_f32(v597, v704); + float32x2_t v794 = vsub_f32(v597, v704); + float32x2_t v815 = vadd_f32(v596, v703); + float32x2_t v816 = vsub_f32(v596, v703); + float32x2_t v859 = vadd_f32(v595, v702); + float32x2_t v860 = vsub_f32(v595, v702); + float32x2_t v881 = vadd_f32(v592, v699); + float32x2_t v882 = vsub_f32(v592, v699); + v6[ostride] = v728; + v6[ostride * 19] = v727; + v6[ostride * 29] = v750; + v6[ostride * 11] = v749; + v6[ostride * 13] = v794; + v6[ostride * 31] = v793; + v6[ostride * 5] = v816; + v6[ostride * 23] = v815; + v6[ostride * 25] = v860; + v6[ostride * 7] = v859; + v6[ostride * 17] = v882; + v6[ostride * 35] = v881; + v5 += 1 * idist; + v6 += 1 * 1; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cf32_ac_n_gu36(const armral_cmplx_int16_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + int idist, float dir) { + int64_t v0 = istride; + int64_t v1 = idist; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * v1; + int64_t v12 = v10 * 1; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + float v606 = -5.0000000000000000e-01F; + float v618 = -1.4999999999999998e+00F; + float v630 = 
7.6604444311897801e-01F; + float v635 = 9.3969262078590832e-01F; + float v640 = -1.7364817766693039e-01F; + float v711 = -1.0000000000000000e+00F; + float v718 = 5.0000000000000000e-01F; + float v730 = 1.4999999999999998e+00F; + float v737 = -8.6602540378443871e-01F; + float v742 = -7.6604444311897801e-01F; + float v749 = -9.3969262078590832e-01F; + float v756 = 1.7364817766693039e-01F; + float v763 = -6.4278760968653925e-01F; + float v768 = 3.4202014332566888e-01F; + float v773 = -9.8480775301220802e-01F; + const int32_t *v1349 = &v5[v0]; + float32x2_t *v1493 = &v6[v2]; + int64_t v27 = v0 * 18; + int64_t v37 = v0 * 9; + int64_t v45 = v0 * 27; + int64_t v57 = v0 * 4; + int64_t v65 = v0 * 22; + int64_t v75 = v0 * 13; + int64_t v83 = v0 * 31; + int64_t v95 = v0 * 8; + int64_t v103 = v0 * 26; + int64_t v113 = v0 * 17; + int64_t v121 = v0 * 35; + int64_t v133 = v0 * 12; + int64_t v141 = v0 * 30; + int64_t v151 = v0 * 21; + int64_t v159 = v0 * 3; + int64_t v171 = v0 * 16; + int64_t v179 = v0 * 34; + int64_t v189 = v0 * 25; + int64_t v197 = v0 * 7; + int64_t v209 = v0 * 20; + int64_t v217 = v0 * 2; + int64_t v227 = v0 * 29; + int64_t v235 = v0 * 11; + int64_t v247 = v0 * 24; + int64_t v255 = v0 * 6; + int64_t v265 = v0 * 33; + int64_t v273 = v0 * 15; + int64_t v285 = v0 * 28; + int64_t v293 = v0 * 10; + int64_t v311 = v0 * 19; + int64_t v323 = v0 * 32; + int64_t v331 = v0 * 14; + int64_t v341 = v0 * 5; + int64_t v349 = v0 * 23; + float v626 = v4 * v737; + float v648 = v4 * v763; + float v655 = v4 * v768; + float v662 = v4 * v773; + float v714 = v4 * v711; + float v721 = v4 * v718; + float v733 = v4 * v730; + float v745 = v4 * v742; + float v752 = v4 * v749; + float v759 = v4 * v756; + int64_t v812 = v2 * 9; + int64_t v819 = v2 * 18; + int64_t v826 = v2 * 27; + int64_t v835 = v2 * 28; + int64_t v849 = v2 * 10; + int64_t v856 = v2 * 19; + int64_t v865 = v2 * 20; + int64_t v872 = v2 * 29; + int64_t v879 = v2 * 2; + int64_t v886 = v2 * 11; + int64_t v895 = v2 * 12; + int64_t v902 = v2 * 21; + int64_t v909 = v2 * 30; + int64_t v916 = v2 * 3; + int64_t v925 = v2 * 4; + int64_t v932 = v2 * 13; + int64_t v939 = v2 * 22; + int64_t v946 = v2 * 31; + int64_t v955 = v2 * 32; + int64_t v962 = v2 * 5; + int64_t v969 = v2 * 14; + int64_t v976 = v2 * 23; + int64_t v985 = v2 * 24; + int64_t v992 = v2 * 33; + int64_t v999 = v2 * 6; + int64_t v1006 = v2 * 15; + int64_t v1015 = v2 * 16; + int64_t v1022 = v2 * 25; + int64_t v1029 = v2 * 34; + int64_t v1036 = v2 * 7; + int64_t v1045 = v2 * 8; + int64_t v1052 = v2 * 17; + int64_t v1059 = v2 * 26; + int64_t v1066 = v2 * 35; + const int32_t *v1079 = &v5[0]; + svint64_t v1395 = svindex_s64(0, v1); + svfloat32_t v1420 = svdup_n_f32(v606); + svfloat32_t v1422 = svdup_n_f32(v618); + svfloat32_t v1424 = svdup_n_f32(v630); + svfloat32_t v1425 = svdup_n_f32(v635); + svfloat32_t v1426 = svdup_n_f32(v640); + svfloat32_t v1434 = svdup_n_f32(v737); + svfloat32_t v1438 = svdup_n_f32(v763); + svfloat32_t v1439 = svdup_n_f32(v768); + svfloat32_t v1440 = svdup_n_f32(v773); + float32x2_t *v1448 = &v6[0]; + svint16_t v1081 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1079), v1395)); + const int32_t *v1088 = &v5[v27]; + const int32_t *v1097 = &v5[v37]; + const int32_t *v1106 = &v5[v45]; + const int32_t *v1115 = &v5[v57]; + const int32_t *v1124 = &v5[v65]; + const int32_t *v1133 = &v5[v75]; + const int32_t *v1142 = &v5[v83]; + const int32_t *v1151 = &v5[v95]; + const int32_t *v1160 = &v5[v103]; + const int32_t *v1169 = &v5[v113]; + const int32_t 
*v1178 = &v5[v121]; + const int32_t *v1187 = &v5[v133]; + const int32_t *v1196 = &v5[v141]; + const int32_t *v1205 = &v5[v151]; + const int32_t *v1214 = &v5[v159]; + const int32_t *v1223 = &v5[v171]; + const int32_t *v1232 = &v5[v179]; + const int32_t *v1241 = &v5[v189]; + const int32_t *v1250 = &v5[v197]; + const int32_t *v1259 = &v5[v209]; + const int32_t *v1268 = &v5[v217]; + const int32_t *v1277 = &v5[v227]; + const int32_t *v1286 = &v5[v235]; + const int32_t *v1295 = &v5[v247]; + const int32_t *v1304 = &v5[v255]; + const int32_t *v1313 = &v5[v265]; + const int32_t *v1322 = &v5[v273]; + const int32_t *v1331 = &v5[v285]; + const int32_t *v1340 = &v5[v293]; + svint16_t v1351 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1349), v1395)); + const int32_t *v1358 = &v5[v311]; + const int32_t *v1367 = &v5[v323]; + const int32_t *v1376 = &v5[v331]; + const int32_t *v1385 = &v5[v341]; + const int32_t *v1394 = &v5[v349]; + svfloat32_t v1423 = svdup_n_f32(v626); + svfloat32_t v1427 = svdup_n_f32(v648); + svfloat32_t v1428 = svdup_n_f32(v655); + svfloat32_t v1429 = svdup_n_f32(v662); + svfloat32_t v1430 = svdup_n_f32(v714); + svfloat32_t v1431 = svdup_n_f32(v721); + svfloat32_t v1433 = svdup_n_f32(v733); + svfloat32_t v1435 = svdup_n_f32(v745); + svfloat32_t v1436 = svdup_n_f32(v752); + svfloat32_t v1437 = svdup_n_f32(v759); + float32x2_t *v1457 = &v6[v812]; + float32x2_t *v1466 = &v6[v819]; + float32x2_t *v1475 = &v6[v826]; + float32x2_t *v1484 = &v6[v835]; + float32x2_t *v1502 = &v6[v849]; + float32x2_t *v1511 = &v6[v856]; + float32x2_t *v1520 = &v6[v865]; + float32x2_t *v1529 = &v6[v872]; + float32x2_t *v1538 = &v6[v879]; + float32x2_t *v1547 = &v6[v886]; + float32x2_t *v1556 = &v6[v895]; + float32x2_t *v1565 = &v6[v902]; + float32x2_t *v1574 = &v6[v909]; + float32x2_t *v1583 = &v6[v916]; + float32x2_t *v1592 = &v6[v925]; + float32x2_t *v1601 = &v6[v932]; + float32x2_t *v1610 = &v6[v939]; + float32x2_t *v1619 = &v6[v946]; + float32x2_t *v1628 = &v6[v955]; + float32x2_t *v1637 = &v6[v962]; + float32x2_t *v1646 = &v6[v969]; + float32x2_t *v1655 = &v6[v976]; + float32x2_t *v1664 = &v6[v985]; + float32x2_t *v1673 = &v6[v992]; + float32x2_t *v1682 = &v6[v999]; + float32x2_t *v1691 = &v6[v1006]; + float32x2_t *v1700 = &v6[v1015]; + float32x2_t *v1709 = &v6[v1022]; + float32x2_t *v1718 = &v6[v1029]; + float32x2_t *v1727 = &v6[v1036]; + float32x2_t *v1736 = &v6[v1045]; + float32x2_t *v1745 = &v6[v1052]; + float32x2_t *v1754 = &v6[v1059]; + float32x2_t *v1763 = &v6[v1066]; + svfloat32_t v25 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1081, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v309 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1351, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svint16_t v1090 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1088), v1395)); + svint16_t v1099 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1097), v1395)); + svint16_t v1108 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1106), v1395)); + svint16_t v1117 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1115), v1395)); + svint16_t v1126 = 
svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1124), v1395)); + svint16_t v1135 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1133), v1395)); + svint16_t v1144 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1142), v1395)); + svint16_t v1153 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1151), v1395)); + svint16_t v1162 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1160), v1395)); + svint16_t v1171 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1169), v1395)); + svint16_t v1180 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1178), v1395)); + svint16_t v1189 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1187), v1395)); + svint16_t v1198 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1196), v1395)); + svint16_t v1207 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1205), v1395)); + svint16_t v1216 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1214), v1395)); + svint16_t v1225 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1223), v1395)); + svint16_t v1234 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1232), v1395)); + svint16_t v1243 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1241), v1395)); + svint16_t v1252 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1250), v1395)); + svint16_t v1261 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1259), v1395)); + svint16_t v1270 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1268), v1395)); + svint16_t v1279 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1277), v1395)); + svint16_t v1288 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1286), v1395)); + svint16_t v1297 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1295), v1395)); + svint16_t v1306 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1304), v1395)); + svint16_t v1315 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1313), v1395)); + svint16_t v1324 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1322), v1395)); + svint16_t v1333 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1331), v1395)); + svint16_t v1342 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1340), v1395)); + svint16_t v1360 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1358), v1395)); + svint16_t v1369 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1367), v1395)); + svint16_t v1378 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1376), v1395)); + svint16_t v1387 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1385), v1395)); + svint16_t v1396 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const 
unsigned *)(v1394), v1395)); + svfloat32_t v33 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1090, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v43 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1099, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v51 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1108, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v63 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1117, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v71 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1126, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v81 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1135, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v89 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1144, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v101 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1153, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v109 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1162, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v119 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1171, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v127 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1180, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v139 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1189, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v147 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1198, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v157 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1207, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v165 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1216, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 
31ULL)); + svfloat32_t v177 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1225, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v185 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1234, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v195 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1243, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v203 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1252, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v215 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1261, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v223 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1270, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v233 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1279, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v241 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1288, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v253 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1297, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v261 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1306, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v271 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1315, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v279 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1324, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v291 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1333, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v299 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1342, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v317 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1360, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + 
svfloat32_t v329 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1369, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v337 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1378, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v347 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1387, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v355 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1396, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v34 = svadd_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v35 = svsub_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v52 = svadd_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v53 = svsub_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v72 = svadd_f32_x(svptrue_b32(), v63, v71); + svfloat32_t v73 = svsub_f32_x(svptrue_b32(), v63, v71); + svfloat32_t v90 = svadd_f32_x(svptrue_b32(), v81, v89); + svfloat32_t v91 = svsub_f32_x(svptrue_b32(), v81, v89); + svfloat32_t v110 = svadd_f32_x(svptrue_b32(), v101, v109); + svfloat32_t v111 = svsub_f32_x(svptrue_b32(), v101, v109); + svfloat32_t v128 = svadd_f32_x(svptrue_b32(), v119, v127); + svfloat32_t v129 = svsub_f32_x(svptrue_b32(), v119, v127); + svfloat32_t v148 = svadd_f32_x(svptrue_b32(), v139, v147); + svfloat32_t v149 = svsub_f32_x(svptrue_b32(), v139, v147); + svfloat32_t v166 = svadd_f32_x(svptrue_b32(), v157, v165); + svfloat32_t v167 = svsub_f32_x(svptrue_b32(), v157, v165); + svfloat32_t v186 = svadd_f32_x(svptrue_b32(), v177, v185); + svfloat32_t v187 = svsub_f32_x(svptrue_b32(), v177, v185); + svfloat32_t v204 = svadd_f32_x(svptrue_b32(), v195, v203); + svfloat32_t v205 = svsub_f32_x(svptrue_b32(), v195, v203); + svfloat32_t v224 = svadd_f32_x(svptrue_b32(), v215, v223); + svfloat32_t v225 = svsub_f32_x(svptrue_b32(), v215, v223); + svfloat32_t v242 = svadd_f32_x(svptrue_b32(), v233, v241); + svfloat32_t v243 = svsub_f32_x(svptrue_b32(), v233, v241); + svfloat32_t v262 = svadd_f32_x(svptrue_b32(), v253, v261); + svfloat32_t v263 = svsub_f32_x(svptrue_b32(), v253, v261); + svfloat32_t v280 = svadd_f32_x(svptrue_b32(), v271, v279); + svfloat32_t v281 = svsub_f32_x(svptrue_b32(), v271, v279); + svfloat32_t v300 = svadd_f32_x(svptrue_b32(), v291, v299); + svfloat32_t v301 = svsub_f32_x(svptrue_b32(), v291, v299); + svfloat32_t v318 = svadd_f32_x(svptrue_b32(), v309, v317); + svfloat32_t v319 = svsub_f32_x(svptrue_b32(), v309, v317); + svfloat32_t v338 = svadd_f32_x(svptrue_b32(), v329, v337); + svfloat32_t v339 = svsub_f32_x(svptrue_b32(), v329, v337); + svfloat32_t v356 = svadd_f32_x(svptrue_b32(), v347, v355); + svfloat32_t v357 = svsub_f32_x(svptrue_b32(), v347, v355); + svfloat32_t v54 = svadd_f32_x(svptrue_b32(), v34, v52); + svfloat32_t v55 = svsub_f32_x(svptrue_b32(), v34, v52); + svfloat32_t v92 = svadd_f32_x(svptrue_b32(), v72, v90); + svfloat32_t v93 = svsub_f32_x(svptrue_b32(), v72, v90); + svfloat32_t v130 = svadd_f32_x(svptrue_b32(), v110, v128); + svfloat32_t v131 = svsub_f32_x(svptrue_b32(), v110, v128); + svfloat32_t v168 = svadd_f32_x(svptrue_b32(), v148, v166); + svfloat32_t v169 = 
svsub_f32_x(svptrue_b32(), v148, v166); + svfloat32_t v206 = svadd_f32_x(svptrue_b32(), v186, v204); + svfloat32_t v207 = svsub_f32_x(svptrue_b32(), v186, v204); + svfloat32_t v244 = svadd_f32_x(svptrue_b32(), v224, v242); + svfloat32_t v245 = svsub_f32_x(svptrue_b32(), v224, v242); + svfloat32_t v282 = svadd_f32_x(svptrue_b32(), v262, v280); + svfloat32_t v283 = svsub_f32_x(svptrue_b32(), v262, v280); + svfloat32_t v320 = svadd_f32_x(svptrue_b32(), v300, v318); + svfloat32_t v321 = svsub_f32_x(svptrue_b32(), v300, v318); + svfloat32_t v358 = svadd_f32_x(svptrue_b32(), v338, v356); + svfloat32_t v359 = svsub_f32_x(svptrue_b32(), v338, v356); + svfloat32_t v580 = svadd_f32_x(svptrue_b32(), v73, v339); + svfloat32_t v581 = svsub_f32_x(svptrue_b32(), v73, v339); + svfloat32_t v582 = svadd_f32_x(svptrue_b32(), v301, v111); + svfloat32_t v583 = svsub_f32_x(svptrue_b32(), v301, v111); + svfloat32_t v584 = svadd_f32_x(svptrue_b32(), v149, v263); + svfloat32_t v585 = svsub_f32_x(svptrue_b32(), v149, v263); + svfloat32_t v586 = svadd_f32_x(svptrue_b32(), v187, v225); + svfloat32_t v587 = svsub_f32_x(svptrue_b32(), v187, v225); + svfloat32_t v690 = svadd_f32_x(svptrue_b32(), v91, v357); + svfloat32_t v691 = svsub_f32_x(svptrue_b32(), v91, v357); + svfloat32_t v692 = svadd_f32_x(svptrue_b32(), v319, v129); + svfloat32_t v693 = svsub_f32_x(svptrue_b32(), v319, v129); + svfloat32_t v694 = svadd_f32_x(svptrue_b32(), v167, v281); + svfloat32_t v695 = svsub_f32_x(svptrue_b32(), v167, v281); + svfloat32_t v696 = svadd_f32_x(svptrue_b32(), v205, v243); + svfloat32_t v697 = svsub_f32_x(svptrue_b32(), v205, v243); + svfloat32_t v360 = svadd_f32_x(svptrue_b32(), v92, v358); + svfloat32_t v361 = svsub_f32_x(svptrue_b32(), v92, v358); + svfloat32_t v362 = svadd_f32_x(svptrue_b32(), v320, v130); + svfloat32_t v363 = svsub_f32_x(svptrue_b32(), v320, v130); + svfloat32_t v364 = svadd_f32_x(svptrue_b32(), v168, v282); + svfloat32_t v365 = svsub_f32_x(svptrue_b32(), v168, v282); + svfloat32_t v366 = svadd_f32_x(svptrue_b32(), v206, v244); + svfloat32_t v367 = svsub_f32_x(svptrue_b32(), v206, v244); + svfloat32_t v470 = svadd_f32_x(svptrue_b32(), v93, v359); + svfloat32_t v471 = svsub_f32_x(svptrue_b32(), v93, v359); + svfloat32_t v472 = svadd_f32_x(svptrue_b32(), v321, v131); + svfloat32_t v473 = svsub_f32_x(svptrue_b32(), v321, v131); + svfloat32_t v474 = svadd_f32_x(svptrue_b32(), v169, v283); + svfloat32_t v475 = svsub_f32_x(svptrue_b32(), v169, v283); + svfloat32_t v476 = svadd_f32_x(svptrue_b32(), v207, v245); + svfloat32_t v477 = svsub_f32_x(svptrue_b32(), v207, v245); + svfloat32_t v588 = svadd_f32_x(svptrue_b32(), v580, v582); + svfloat32_t v592 = svadd_f32_x(svptrue_b32(), v581, v583); + svfloat32_t v594 = svsub_f32_x(svptrue_b32(), v580, v582); + svfloat32_t v595 = svsub_f32_x(svptrue_b32(), v582, v586); + svfloat32_t v596 = svsub_f32_x(svptrue_b32(), v586, v580); + svfloat32_t v597 = svsub_f32_x(svptrue_b32(), v581, v583); + svfloat32_t v598 = svsub_f32_x(svptrue_b32(), v583, v587); + svfloat32_t v599 = svsub_f32_x(svptrue_b32(), v587, v581); + svfloat32_t zero628 = svdup_n_f32(0); + svfloat32_t v628 = svcmla_f32_x(pred_full, zero628, v1423, v585, 90); + svfloat32_t v698 = svadd_f32_x(svptrue_b32(), v690, v692); + svfloat32_t v702 = svadd_f32_x(svptrue_b32(), v691, v693); + svfloat32_t v704 = svsub_f32_x(svptrue_b32(), v690, v692); + svfloat32_t v705 = svsub_f32_x(svptrue_b32(), v692, v696); + svfloat32_t v706 = svsub_f32_x(svptrue_b32(), v696, v690); + svfloat32_t v707 = svsub_f32_x(svptrue_b32(), v691, 
v693); + svfloat32_t v708 = svsub_f32_x(svptrue_b32(), v693, v697); + svfloat32_t v709 = svsub_f32_x(svptrue_b32(), v697, v691); + svfloat32_t v368 = svadd_f32_x(svptrue_b32(), v360, v362); + svfloat32_t v372 = svadd_f32_x(svptrue_b32(), v361, v363); + svfloat32_t v374 = svsub_f32_x(svptrue_b32(), v360, v362); + svfloat32_t v375 = svsub_f32_x(svptrue_b32(), v362, v366); + svfloat32_t v376 = svsub_f32_x(svptrue_b32(), v366, v360); + svfloat32_t v377 = svsub_f32_x(svptrue_b32(), v361, v363); + svfloat32_t v378 = svsub_f32_x(svptrue_b32(), v363, v367); + svfloat32_t v379 = svsub_f32_x(svptrue_b32(), v367, v361); + svfloat32_t zero408 = svdup_n_f32(0); + svfloat32_t v408 = svcmla_f32_x(pred_full, zero408, v1423, v365, 90); + svfloat32_t v478 = svadd_f32_x(svptrue_b32(), v470, v472); + svfloat32_t v482 = svadd_f32_x(svptrue_b32(), v471, v473); + svfloat32_t v484 = svsub_f32_x(svptrue_b32(), v470, v472); + svfloat32_t v485 = svsub_f32_x(svptrue_b32(), v472, v476); + svfloat32_t v486 = svsub_f32_x(svptrue_b32(), v476, v470); + svfloat32_t v487 = svsub_f32_x(svptrue_b32(), v471, v473); + svfloat32_t v488 = svsub_f32_x(svptrue_b32(), v473, v477); + svfloat32_t v489 = svsub_f32_x(svptrue_b32(), v477, v471); + svfloat32_t zero518 = svdup_n_f32(0); + svfloat32_t v518 = svcmla_f32_x(pred_full, zero518, v1423, v475, 90); + svfloat32_t v589 = svadd_f32_x(svptrue_b32(), v588, v586); + svfloat32_t v593 = svadd_f32_x(svptrue_b32(), v592, v587); + svfloat32_t zero650 = svdup_n_f32(0); + svfloat32_t v650 = svcmla_f32_x(pred_full, zero650, v1427, v597, 90); + svfloat32_t zero657 = svdup_n_f32(0); + svfloat32_t v657 = svcmla_f32_x(pred_full, zero657, v1428, v598, 90); + svfloat32_t zero664 = svdup_n_f32(0); + svfloat32_t v664 = svcmla_f32_x(pred_full, zero664, v1429, v599, 90); + svfloat32_t v699 = svadd_f32_x(svptrue_b32(), v698, v696); + svfloat32_t v703 = svadd_f32_x(svptrue_b32(), v702, v697); + svfloat32_t zero747 = svdup_n_f32(0); + svfloat32_t v747 = svcmla_f32_x(pred_full, zero747, v1435, v704, 90); + svfloat32_t zero754 = svdup_n_f32(0); + svfloat32_t v754 = svcmla_f32_x(pred_full, zero754, v1436, v705, 90); + svfloat32_t zero761 = svdup_n_f32(0); + svfloat32_t v761 = svcmla_f32_x(pred_full, zero761, v1437, v706, 90); + svfloat32_t v766 = svmul_f32_x(svptrue_b32(), v707, v1438); + svfloat32_t v771 = svmul_f32_x(svptrue_b32(), v708, v1439); + svfloat32_t v369 = svadd_f32_x(svptrue_b32(), v368, v366); + svfloat32_t v373 = svadd_f32_x(svptrue_b32(), v372, v367); + svfloat32_t zero430 = svdup_n_f32(0); + svfloat32_t v430 = svcmla_f32_x(pred_full, zero430, v1427, v377, 90); + svfloat32_t zero437 = svdup_n_f32(0); + svfloat32_t v437 = svcmla_f32_x(pred_full, zero437, v1428, v378, 90); + svfloat32_t zero444 = svdup_n_f32(0); + svfloat32_t v444 = svcmla_f32_x(pred_full, zero444, v1429, v379, 90); + svfloat32_t v479 = svadd_f32_x(svptrue_b32(), v478, v476); + svfloat32_t v483 = svadd_f32_x(svptrue_b32(), v482, v477); + svfloat32_t zero540 = svdup_n_f32(0); + svfloat32_t v540 = svcmla_f32_x(pred_full, zero540, v1427, v487, 90); + svfloat32_t zero547 = svdup_n_f32(0); + svfloat32_t v547 = svcmla_f32_x(pred_full, zero547, v1428, v488, 90); + svfloat32_t zero554 = svdup_n_f32(0); + svfloat32_t v554 = svcmla_f32_x(pred_full, zero554, v1429, v489, 90); + svfloat32_t v590 = svadd_f32_x(svptrue_b32(), v589, v584); + svfloat32_t v609 = svmul_f32_x(svptrue_b32(), v589, v1420); + svfloat32_t zero616 = svdup_n_f32(0); + svfloat32_t v616 = svcmla_f32_x(pred_full, zero616, v1423, v593, 90); + svfloat32_t v678 = 
svadd_f32_x(svptrue_b32(), v628, v650); + svfloat32_t v680 = svsub_f32_x(svptrue_b32(), v628, v657); + svfloat32_t v682 = svsub_f32_x(svptrue_b32(), v628, v650); + svfloat32_t v700 = svadd_f32_x(svptrue_b32(), v699, v694); + svfloat32_t zero723 = svdup_n_f32(0); + svfloat32_t v723 = svcmla_f32_x(pred_full, zero723, v1431, v699, 90); + svfloat32_t v790 = svmla_f32_x(pred_full, v766, v695, v1434); + svfloat32_t v792 = svnmls_f32_x(pred_full, v771, v695, v1434); + svfloat32_t v794 = svnmls_f32_x(pred_full, v766, v695, v1434); + svfloat32_t v370 = svadd_f32_x(svptrue_b32(), v369, v364); + svfloat32_t v389 = svmul_f32_x(svptrue_b32(), v369, v1420); + svfloat32_t zero396 = svdup_n_f32(0); + svfloat32_t v396 = svcmla_f32_x(pred_full, zero396, v1423, v373, 90); + svfloat32_t v458 = svadd_f32_x(svptrue_b32(), v408, v430); + svfloat32_t v460 = svsub_f32_x(svptrue_b32(), v408, v437); + svfloat32_t v462 = svsub_f32_x(svptrue_b32(), v408, v430); + svfloat32_t v480 = svadd_f32_x(svptrue_b32(), v479, v474); + svfloat32_t v499 = svmul_f32_x(svptrue_b32(), v479, v1420); + svfloat32_t zero506 = svdup_n_f32(0); + svfloat32_t v506 = svcmla_f32_x(pred_full, zero506, v1423, v483, 90); + svfloat32_t v568 = svadd_f32_x(svptrue_b32(), v518, v540); + svfloat32_t v570 = svsub_f32_x(svptrue_b32(), v518, v547); + svfloat32_t v572 = svsub_f32_x(svptrue_b32(), v518, v540); + svfloat32_t v591 = svadd_f32_x(svptrue_b32(), v590, v35); + svfloat32_t v665 = svadd_f32_x(svptrue_b32(), v609, v609); + svfloat32_t v679 = svadd_f32_x(svptrue_b32(), v678, v657); + svfloat32_t v681 = svadd_f32_x(svptrue_b32(), v680, v664); + svfloat32_t v683 = svsub_f32_x(svptrue_b32(), v682, v664); + svfloat32_t v701 = svadd_f32_x(svptrue_b32(), v700, v53); + svfloat32_t v777 = svadd_f32_x(svptrue_b32(), v723, v723); + svfloat32_t v791 = svmla_f32_x(pred_full, v790, v708, v1439); + svfloat32_t v793 = svmla_f32_x(pred_full, v792, v709, v1440); + svfloat32_t v795 = svmls_f32_x(pred_full, v794, v709, v1440); + svfloat32_t v371 = svadd_f32_x(svptrue_b32(), v370, v54); + svfloat32_t v445 = svadd_f32_x(svptrue_b32(), v389, v389); + svfloat32_t v459 = svadd_f32_x(svptrue_b32(), v458, v437); + svfloat32_t v461 = svadd_f32_x(svptrue_b32(), v460, v444); + svfloat32_t v463 = svsub_f32_x(svptrue_b32(), v462, v444); + svfloat32_t v481 = svadd_f32_x(svptrue_b32(), v480, v55); + svfloat32_t v555 = svadd_f32_x(svptrue_b32(), v499, v499); + svfloat32_t v569 = svadd_f32_x(svptrue_b32(), v568, v547); + svfloat32_t v571 = svadd_f32_x(svptrue_b32(), v570, v554); + svfloat32_t v573 = svsub_f32_x(svptrue_b32(), v572, v554); + svfloat32_t v666 = svmla_f32_x(pred_full, v665, v589, v1420); + svfloat32_t v670 = svmla_f32_x(pred_full, v591, v584, v1422); + svfloat32_t zero716 = svdup_n_f32(0); + svfloat32_t v716 = svcmla_f32_x(pred_full, zero716, v1430, v701, 90); + svfloat32_t v778 = svadd_f32_x(svptrue_b32(), v777, v723); + svfloat32_t v446 = svmla_f32_x(pred_full, v445, v369, v1420); + svfloat32_t v450 = svmla_f32_x(pred_full, v371, v364, v1422); + svfloat32_t v556 = svmla_f32_x(pred_full, v555, v479, v1420); + svfloat32_t v560 = svmla_f32_x(pred_full, v481, v474, v1422); + svfloat32_t v667 = svadd_f32_x(svptrue_b32(), v591, v666); + svfloat32_t v671 = svadd_f32_x(svptrue_b32(), v670, v665); + svfloat32_t v779 = svadd_f32_x(svptrue_b32(), v716, v778); + svfloat32_t v782 = svcmla_f32_x(pred_full, v716, v1433, v694, 90); + svfloat32_t v802 = svadd_f32_x(svptrue_b32(), v591, v716); + svfloat32_t v803 = svsub_f32_x(svptrue_b32(), v591, v716); + svst1_f64(pred_full, (double 
*)(v1448), svreinterpret_f64_f32(v371)); + svst1_f64(pred_full, (double *)(v1466), svreinterpret_f64_f32(v481)); + svfloat32_t v447 = svadd_f32_x(svptrue_b32(), v371, v446); + svfloat32_t v451 = svadd_f32_x(svptrue_b32(), v450, v445); + svfloat32_t v557 = svadd_f32_x(svptrue_b32(), v481, v556); + svfloat32_t v561 = svadd_f32_x(svptrue_b32(), v560, v555); + svfloat32_t v668 = svadd_f32_x(svptrue_b32(), v667, v616); + svfloat32_t v669 = svsub_f32_x(svptrue_b32(), v667, v616); + svfloat32_t v672 = svmla_f32_x(pred_full, v671, v594, v1424); + svfloat32_t v674 = svmls_f32_x(pred_full, v671, v595, v1425); + svfloat32_t v676 = svmls_f32_x(pred_full, v671, v594, v1424); + svfloat32_t v780 = svmla_f32_x(pred_full, v779, v703, v1434); + svfloat32_t v781 = svmls_f32_x(pred_full, v779, v703, v1434); + svfloat32_t v783 = svadd_f32_x(svptrue_b32(), v782, v777); + svst1_f64(pred_full, (double *)(v1457), svreinterpret_f64_f32(v803)); + svst1_f64(pred_full, (double *)(v1475), svreinterpret_f64_f32(v802)); + svfloat32_t v448 = svadd_f32_x(svptrue_b32(), v447, v396); + svfloat32_t v449 = svsub_f32_x(svptrue_b32(), v447, v396); + svfloat32_t v452 = svmla_f32_x(pred_full, v451, v374, v1424); + svfloat32_t v454 = svmls_f32_x(pred_full, v451, v375, v1425); + svfloat32_t v456 = svmls_f32_x(pred_full, v451, v374, v1424); + svfloat32_t v558 = svadd_f32_x(svptrue_b32(), v557, v506); + svfloat32_t v559 = svsub_f32_x(svptrue_b32(), v557, v506); + svfloat32_t v562 = svmla_f32_x(pred_full, v561, v484, v1424); + svfloat32_t v564 = svmls_f32_x(pred_full, v561, v485, v1425); + svfloat32_t v566 = svmls_f32_x(pred_full, v561, v484, v1424); + svfloat32_t v673 = svmla_f32_x(pred_full, v672, v595, v1425); + svfloat32_t v675 = svmla_f32_x(pred_full, v674, v596, v1426); + svfloat32_t v677 = svmls_f32_x(pred_full, v676, v596, v1426); + svfloat32_t v784 = svadd_f32_x(svptrue_b32(), v783, v747); + svfloat32_t v786 = svsub_f32_x(svptrue_b32(), v783, v754); + svfloat32_t v788 = svsub_f32_x(svptrue_b32(), v783, v747); + svfloat32_t v892 = svadd_f32_x(svptrue_b32(), v669, v781); + svfloat32_t v893 = svsub_f32_x(svptrue_b32(), v669, v781); + svfloat32_t v982 = svadd_f32_x(svptrue_b32(), v668, v780); + svfloat32_t v983 = svsub_f32_x(svptrue_b32(), v668, v780); + svfloat32_t v453 = svmla_f32_x(pred_full, v452, v375, v1425); + svfloat32_t v455 = svmla_f32_x(pred_full, v454, v376, v1426); + svfloat32_t v457 = svmls_f32_x(pred_full, v456, v376, v1426); + svfloat32_t v563 = svmla_f32_x(pred_full, v562, v485, v1425); + svfloat32_t v565 = svmla_f32_x(pred_full, v564, v486, v1426); + svfloat32_t v567 = svmls_f32_x(pred_full, v566, v486, v1426); + svfloat32_t v684 = svadd_f32_x(svptrue_b32(), v673, v679); + svfloat32_t v685 = svsub_f32_x(svptrue_b32(), v673, v679); + svfloat32_t v686 = svadd_f32_x(svptrue_b32(), v675, v681); + svfloat32_t v687 = svsub_f32_x(svptrue_b32(), v675, v681); + svfloat32_t v688 = svadd_f32_x(svptrue_b32(), v677, v683); + svfloat32_t v689 = svsub_f32_x(svptrue_b32(), v677, v683); + svfloat32_t v785 = svadd_f32_x(svptrue_b32(), v784, v754); + svfloat32_t v787 = svadd_f32_x(svptrue_b32(), v786, v761); + svfloat32_t v789 = svsub_f32_x(svptrue_b32(), v788, v761); + svst1_f64(pred_full, (double *)(v1556), svreinterpret_f64_f32(v449)); + svst1_f64(pred_full, (double *)(v1565), svreinterpret_f64_f32(v893)); + svst1_f64(pred_full, (double *)(v1574), svreinterpret_f64_f32(v559)); + svst1_f64(pred_full, (double *)(v1583), svreinterpret_f64_f32(v892)); + svst1_f64(pred_full, (double *)(v1664), svreinterpret_f64_f32(v448)); + 
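+    // Editorial note (assumption, not part of the generated kernel): each
+    // 64-bit SVE lane holds one complex float (re, im) pair, so the
+    // svreinterpret_f64_f32 + svst1_f64 pattern used for these stores writes
+    // whole complex values lane by lane, matching the per-lane layout of the
+    // 64-bit gathers used to read the int16 input pairs.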
svst1_f64(pred_full, (double *)(v1673), svreinterpret_f64_f32(v983)); + svst1_f64(pred_full, (double *)(v1682), svreinterpret_f64_f32(v558)); + svst1_f64(pred_full, (double *)(v1691), svreinterpret_f64_f32(v982)); + svfloat32_t v464 = svadd_f32_x(svptrue_b32(), v453, v459); + svfloat32_t v465 = svsub_f32_x(svptrue_b32(), v453, v459); + svfloat32_t v466 = svadd_f32_x(svptrue_b32(), v455, v461); + svfloat32_t v467 = svsub_f32_x(svptrue_b32(), v455, v461); + svfloat32_t v468 = svadd_f32_x(svptrue_b32(), v457, v463); + svfloat32_t v469 = svsub_f32_x(svptrue_b32(), v457, v463); + svfloat32_t v574 = svadd_f32_x(svptrue_b32(), v563, v569); + svfloat32_t v575 = svsub_f32_x(svptrue_b32(), v563, v569); + svfloat32_t v576 = svadd_f32_x(svptrue_b32(), v565, v571); + svfloat32_t v577 = svsub_f32_x(svptrue_b32(), v565, v571); + svfloat32_t v578 = svadd_f32_x(svptrue_b32(), v567, v573); + svfloat32_t v579 = svsub_f32_x(svptrue_b32(), v567, v573); + svfloat32_t v796 = svadd_f32_x(svptrue_b32(), v785, v791); + svfloat32_t v797 = svsub_f32_x(svptrue_b32(), v785, v791); + svfloat32_t v798 = svadd_f32_x(svptrue_b32(), v787, v793); + svfloat32_t v799 = svsub_f32_x(svptrue_b32(), v787, v793); + svfloat32_t v800 = svadd_f32_x(svptrue_b32(), v789, v795); + svfloat32_t v801 = svsub_f32_x(svptrue_b32(), v789, v795); + svfloat32_t v832 = svadd_f32_x(svptrue_b32(), v685, v797); + svfloat32_t v833 = svsub_f32_x(svptrue_b32(), v685, v797); + svfloat32_t v862 = svadd_f32_x(svptrue_b32(), v686, v798); + svfloat32_t v863 = svsub_f32_x(svptrue_b32(), v686, v798); + svfloat32_t v922 = svadd_f32_x(svptrue_b32(), v689, v801); + svfloat32_t v923 = svsub_f32_x(svptrue_b32(), v689, v801); + svfloat32_t v952 = svadd_f32_x(svptrue_b32(), v688, v800); + svfloat32_t v953 = svsub_f32_x(svptrue_b32(), v688, v800); + svfloat32_t v1012 = svadd_f32_x(svptrue_b32(), v687, v799); + svfloat32_t v1013 = svsub_f32_x(svptrue_b32(), v687, v799); + svfloat32_t v1042 = svadd_f32_x(svptrue_b32(), v684, v796); + svfloat32_t v1043 = svsub_f32_x(svptrue_b32(), v684, v796); + svst1_f64(pred_full, (double *)(v1484), svreinterpret_f64_f32(v465)); + svst1_f64(pred_full, (double *)(v1502), svreinterpret_f64_f32(v575)); + svst1_f64(pred_full, (double *)(v1520), svreinterpret_f64_f32(v466)); + svst1_f64(pred_full, (double *)(v1538), svreinterpret_f64_f32(v576)); + svst1_f64(pred_full, (double *)(v1592), svreinterpret_f64_f32(v469)); + svst1_f64(pred_full, (double *)(v1610), svreinterpret_f64_f32(v579)); + svst1_f64(pred_full, (double *)(v1628), svreinterpret_f64_f32(v468)); + svst1_f64(pred_full, (double *)(v1646), svreinterpret_f64_f32(v578)); + svst1_f64(pred_full, (double *)(v1700), svreinterpret_f64_f32(v467)); + svst1_f64(pred_full, (double *)(v1718), svreinterpret_f64_f32(v577)); + svst1_f64(pred_full, (double *)(v1736), svreinterpret_f64_f32(v464)); + svst1_f64(pred_full, (double *)(v1754), svreinterpret_f64_f32(v574)); + svst1_f64(pred_full, (double *)(v1493), svreinterpret_f64_f32(v833)); + svst1_f64(pred_full, (double *)(v1511), svreinterpret_f64_f32(v832)); + svst1_f64(pred_full, (double *)(v1529), svreinterpret_f64_f32(v863)); + svst1_f64(pred_full, (double *)(v1547), svreinterpret_f64_f32(v862)); + svst1_f64(pred_full, (double *)(v1601), svreinterpret_f64_f32(v923)); + svst1_f64(pred_full, (double *)(v1619), svreinterpret_f64_f32(v922)); + svst1_f64(pred_full, (double *)(v1637), svreinterpret_f64_f32(v953)); + svst1_f64(pred_full, (double *)(v1655), svreinterpret_f64_f32(v952)); + svst1_f64(pred_full, (double *)(v1709), 
svreinterpret_f64_f32(v1013)); + svst1_f64(pred_full, (double *)(v1727), svreinterpret_f64_f32(v1012)); + svst1_f64(pred_full, (double *)(v1745), svreinterpret_f64_f32(v1043)); + svst1_f64(pred_full, (double *)(v1763), svreinterpret_f64_f32(v1042)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cf32_ac_n_gu40(const armral_cmplx_int16_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + int idist, float dir) { + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + for (int j = 0; j < howmany; j += 1) { + int16x4_t v230 = vld1s_s16(&v5[istride]); + float v359 = 1.0000000000000000e+00F; + float v360 = -1.0000000000000000e+00F; + float v367 = -7.0710678118654746e-01F; + float v374 = 7.0710678118654757e-01F; + float v426 = -1.2500000000000000e+00F; + float v427 = 1.2500000000000000e+00F; + float v434 = 8.8388347648318433e-01F; + float v441 = -8.8388347648318444e-01F; + float v493 = 5.5901699437494745e-01F; + float v494 = -5.5901699437494745e-01F; + float v501 = -3.9528470752104738e-01F; + float v508 = 3.9528470752104744e-01F; + float v562 = 1.5388417685876268e+00F; + float v570 = -1.5388417685876268e+00F; + float v577 = 1.0881254497414108e+00F; + float v578 = -1.0881254497414108e+00F; + float v635 = 5.8778525229247325e-01F; + float v643 = -5.8778525229247325e-01F; + float v650 = 4.1562693777745352e-01F; + float v651 = -4.1562693777745352e-01F; + float v708 = 3.6327126400268028e-01F; + float v716 = -3.6327126400268028e-01F; + float v723 = 2.5687157418650380e-01F; + float v724 = -2.5687157418650380e-01F; + float32x2_t v726 = (float32x2_t){v4, v4}; + int16x4_t v51 = vld1s_s16(&v5[0]); + float32x2_t v231 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v230)), 15); + float32x2_t v361 = (float32x2_t){v359, v360}; + float32x2_t v368 = (float32x2_t){v374, v367}; + float32x2_t v375 = (float32x2_t){v374, v374}; + float32x2_t v424 = (float32x2_t){v426, v426}; + float32x2_t v428 = (float32x2_t){v426, v427}; + float32x2_t v435 = (float32x2_t){v441, v434}; + float32x2_t v442 = (float32x2_t){v441, v441}; + float32x2_t v491 = (float32x2_t){v493, v493}; + float32x2_t v495 = (float32x2_t){v493, v494}; + float32x2_t v502 = (float32x2_t){v508, v501}; + float32x2_t v509 = (float32x2_t){v508, v508}; + float32x2_t v564 = (float32x2_t){v562, v570}; + float32x2_t v571 = (float32x2_t){v570, v570}; + float32x2_t v575 = (float32x2_t){v578, v578}; + float32x2_t v579 = (float32x2_t){v577, v578}; + float32x2_t v637 = (float32x2_t){v635, v643}; + float32x2_t v644 = (float32x2_t){v643, v643}; + float32x2_t v648 = (float32x2_t){v651, v651}; + float32x2_t v652 = (float32x2_t){v650, v651}; + float32x2_t v710 = (float32x2_t){v708, v716}; + float32x2_t v717 = (float32x2_t){v716, v716}; + float32x2_t v721 = (float32x2_t){v724, v724}; + float32x2_t v725 = (float32x2_t){v723, v724}; + int16x4_t v20 = vld1s_s16(&v5[istride * 8]); + int16x4_t v26 = vld1s_s16(&v5[istride * 32]); + int16x4_t v34 = vld1s_s16(&v5[istride * 24]); + int16x4_t v40 = vld1s_s16(&v5[istride * 16]); + float32x2_t v52 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v51)), 15); + int16x4_t v58 = vld1s_s16(&v5[istride * 13]); + int16x4_t v64 = vld1s_s16(&v5[istride * 37]); + int16x4_t v72 = vld1s_s16(&v5[istride * 29]); + int16x4_t v78 = vld1s_s16(&v5[istride * 21]); + int16x4_t v89 = vld1s_s16(&v5[istride * 5]); + int16x4_t v96 = vld1s_s16(&v5[istride * 18]); + int16x4_t v102 = vld1s_s16(&v5[istride * 2]); + int16x4_t v110 = vld1s_s16(&v5[istride * 
34]); + int16x4_t v116 = vld1s_s16(&v5[istride * 26]); + int16x4_t v127 = vld1s_s16(&v5[istride * 10]); + int16x4_t v134 = vld1s_s16(&v5[istride * 23]); + int16x4_t v140 = vld1s_s16(&v5[istride * 7]); + int16x4_t v148 = vld1s_s16(&v5[istride * 39]); + int16x4_t v154 = vld1s_s16(&v5[istride * 31]); + int16x4_t v165 = vld1s_s16(&v5[istride * 15]); + int16x4_t v172 = vld1s_s16(&v5[istride * 28]); + int16x4_t v178 = vld1s_s16(&v5[istride * 12]); + int16x4_t v186 = vld1s_s16(&v5[istride * 4]); + int16x4_t v192 = vld1s_s16(&v5[istride * 36]); + int16x4_t v203 = vld1s_s16(&v5[istride * 20]); + int16x4_t v210 = vld1s_s16(&v5[istride * 33]); + int16x4_t v216 = vld1s_s16(&v5[istride * 17]); + int16x4_t v224 = vld1s_s16(&v5[istride * 9]); + int16x4_t v241 = vld1s_s16(&v5[istride * 25]); + int16x4_t v248 = vld1s_s16(&v5[istride * 38]); + int16x4_t v254 = vld1s_s16(&v5[istride * 22]); + int16x4_t v262 = vld1s_s16(&v5[istride * 14]); + int16x4_t v268 = vld1s_s16(&v5[istride * 6]); + int16x4_t v279 = vld1s_s16(&v5[istride * 30]); + int16x4_t v286 = vld1s_s16(&v5[istride * 3]); + int16x4_t v292 = vld1s_s16(&v5[istride * 27]); + int16x4_t v300 = vld1s_s16(&v5[istride * 19]); + int16x4_t v306 = vld1s_s16(&v5[istride * 11]); + int16x4_t v317 = vld1s_s16(&v5[istride * 35]); + float32x2_t v363 = vmul_f32(v726, v361); + float32x2_t v370 = vmul_f32(v726, v368); + float32x2_t v430 = vmul_f32(v726, v428); + float32x2_t v437 = vmul_f32(v726, v435); + float32x2_t v497 = vmul_f32(v726, v495); + float32x2_t v504 = vmul_f32(v726, v502); + float32x2_t v566 = vmul_f32(v726, v564); + float32x2_t v581 = vmul_f32(v726, v579); + float32x2_t v639 = vmul_f32(v726, v637); + float32x2_t v654 = vmul_f32(v726, v652); + float32x2_t v712 = vmul_f32(v726, v710); + float32x2_t v727 = vmul_f32(v726, v725); + float32x2_t v21 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v20)), 15); + float32x2_t v27 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v26)), 15); + float32x2_t v35 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v34)), 15); + float32x2_t v41 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v40)), 15); + float32x2_t v59 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v58)), 15); + float32x2_t v65 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v64)), 15); + float32x2_t v73 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v72)), 15); + float32x2_t v79 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v78)), 15); + float32x2_t v90 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v89)), 15); + float32x2_t v97 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v96)), 15); + float32x2_t v103 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v102)), 15); + float32x2_t v111 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v110)), 15); + float32x2_t v117 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v116)), 15); + float32x2_t v128 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v127)), 15); + float32x2_t v135 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v134)), 15); + float32x2_t v141 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v140)), 15); + float32x2_t v149 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v148)), 15); + float32x2_t v155 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v154)), 15); + float32x2_t v166 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v165)), 15); + float32x2_t v173 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v172)), 15); + float32x2_t v179 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v178)), 15); + float32x2_t v187 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v186)), 15); + float32x2_t v193 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v192)), 15); + float32x2_t v204 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v203)), 15); + float32x2_t v211 = 
vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v210)), 15); + float32x2_t v217 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v216)), 15); + float32x2_t v225 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v224)), 15); + float32x2_t v242 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v241)), 15); + float32x2_t v249 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v248)), 15); + float32x2_t v255 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v254)), 15); + float32x2_t v263 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v262)), 15); + float32x2_t v269 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v268)), 15); + float32x2_t v280 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v279)), 15); + float32x2_t v287 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v286)), 15); + float32x2_t v293 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v292)), 15); + float32x2_t v301 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v300)), 15); + float32x2_t v307 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v306)), 15); + float32x2_t v318 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v317)), 15); + float32x2_t v28 = vadd_f32(v21, v27); + float32x2_t v29 = vsub_f32(v21, v27); + float32x2_t v42 = vadd_f32(v35, v41); + float32x2_t v43 = vsub_f32(v35, v41); + float32x2_t v66 = vadd_f32(v59, v65); + float32x2_t v67 = vsub_f32(v59, v65); + float32x2_t v80 = vadd_f32(v73, v79); + float32x2_t v81 = vsub_f32(v73, v79); + float32x2_t v104 = vadd_f32(v97, v103); + float32x2_t v105 = vsub_f32(v97, v103); + float32x2_t v118 = vadd_f32(v111, v117); + float32x2_t v119 = vsub_f32(v111, v117); + float32x2_t v142 = vadd_f32(v135, v141); + float32x2_t v143 = vsub_f32(v135, v141); + float32x2_t v156 = vadd_f32(v149, v155); + float32x2_t v157 = vsub_f32(v149, v155); + float32x2_t v180 = vadd_f32(v173, v179); + float32x2_t v181 = vsub_f32(v173, v179); + float32x2_t v194 = vadd_f32(v187, v193); + float32x2_t v195 = vsub_f32(v187, v193); + float32x2_t v218 = vadd_f32(v211, v217); + float32x2_t v219 = vsub_f32(v211, v217); + float32x2_t v232 = vadd_f32(v225, v231); + float32x2_t v233 = vsub_f32(v225, v231); + float32x2_t v256 = vadd_f32(v249, v255); + float32x2_t v257 = vsub_f32(v249, v255); + float32x2_t v270 = vadd_f32(v263, v269); + float32x2_t v271 = vsub_f32(v263, v269); + float32x2_t v294 = vadd_f32(v287, v293); + float32x2_t v295 = vsub_f32(v287, v293); + float32x2_t v308 = vadd_f32(v301, v307); + float32x2_t v309 = vsub_f32(v301, v307); + float32x2_t v44 = vadd_f32(v28, v42); + float32x2_t v45 = vsub_f32(v28, v42); + float32x2_t v46 = vadd_f32(v29, v43); + float32x2_t v82 = vadd_f32(v66, v80); + float32x2_t v83 = vsub_f32(v66, v80); + float32x2_t v84 = vadd_f32(v67, v81); + float32x2_t v120 = vadd_f32(v104, v118); + float32x2_t v121 = vsub_f32(v104, v118); + float32x2_t v122 = vadd_f32(v105, v119); + float32x2_t v158 = vadd_f32(v142, v156); + float32x2_t v159 = vsub_f32(v142, v156); + float32x2_t v160 = vadd_f32(v143, v157); + float32x2_t v196 = vadd_f32(v180, v194); + float32x2_t v197 = vsub_f32(v180, v194); + float32x2_t v198 = vadd_f32(v181, v195); + float32x2_t v234 = vadd_f32(v218, v232); + float32x2_t v235 = vsub_f32(v218, v232); + float32x2_t v236 = vadd_f32(v219, v233); + float32x2_t v272 = vadd_f32(v256, v270); + float32x2_t v273 = vsub_f32(v256, v270); + float32x2_t v274 = vadd_f32(v257, v271); + float32x2_t v310 = vadd_f32(v294, v308); + float32x2_t v311 = vsub_f32(v294, v308); + float32x2_t v312 = vadd_f32(v295, v309); + float32x2_t v521 = vadd_f32(v29, v181); + float32x2_t v522 = vsub_f32(v29, v181); + float32x2_t v523 = vadd_f32(v105, v257); + float32x2_t v524 = vsub_f32(v105, v257); + float32x2_t v525 = 
vadd_f32(v67, v219); + float32x2_t v526 = vsub_f32(v67, v219); + float32x2_t v527 = vadd_f32(v143, v295); + float32x2_t v528 = vsub_f32(v143, v295); + float32x2_t v667 = vadd_f32(v43, v195); + float32x2_t v668 = vsub_f32(v43, v195); + float32x2_t v669 = vadd_f32(v119, v271); + float32x2_t v670 = vsub_f32(v119, v271); + float32x2_t v671 = vadd_f32(v81, v233); + float32x2_t v672 = vsub_f32(v81, v233); + float32x2_t v673 = vadd_f32(v157, v309); + float32x2_t v674 = vsub_f32(v157, v309); + float32x2_t v53 = vadd_f32(v44, v52); + float32x2_t v91 = vadd_f32(v82, v90); + float32x2_t v129 = vadd_f32(v120, v128); + float32x2_t v167 = vadd_f32(v158, v166); + float32x2_t v205 = vadd_f32(v196, v204); + float32x2_t v243 = vadd_f32(v234, v242); + float32x2_t v281 = vadd_f32(v272, v280); + float32x2_t v319 = vadd_f32(v310, v318); + float32x2_t v387 = vadd_f32(v44, v196); + float32x2_t v388 = vsub_f32(v44, v196); + float32x2_t v389 = vadd_f32(v120, v272); + float32x2_t v390 = vsub_f32(v120, v272); + float32x2_t v391 = vadd_f32(v82, v234); + float32x2_t v392 = vsub_f32(v82, v234); + float32x2_t v393 = vadd_f32(v158, v310); + float32x2_t v394 = vsub_f32(v158, v310); + float32x2_t v454 = vadd_f32(v45, v197); + float32x2_t v455 = vsub_f32(v45, v197); + float32x2_t v456 = vadd_f32(v121, v273); + float32x2_t v457 = vsub_f32(v121, v273); + float32x2_t v458 = vadd_f32(v83, v235); + float32x2_t v459 = vsub_f32(v83, v235); + float32x2_t v460 = vadd_f32(v159, v311); + float32x2_t v461 = vsub_f32(v159, v311); + float32x2_t v529 = vadd_f32(v521, v523); + float32x2_t v530 = vsub_f32(v521, v523); + float32x2_t v531 = vadd_f32(v525, v527); + float32x2_t v532 = vsub_f32(v525, v527); + float32x2_t v535 = vadd_f32(v526, v528); + float32x2_t v536 = vsub_f32(v526, v528); + float32x2_t v567 = vrev64_f32(v522); + float32x2_t v572 = vmul_f32(v524, v571); + float32x2_t v594 = vadd_f32(v46, v198); + float32x2_t v595 = vsub_f32(v46, v198); + float32x2_t v596 = vadd_f32(v122, v274); + float32x2_t v597 = vsub_f32(v122, v274); + float32x2_t v598 = vadd_f32(v84, v236); + float32x2_t v599 = vsub_f32(v84, v236); + float32x2_t v600 = vadd_f32(v160, v312); + float32x2_t v601 = vsub_f32(v160, v312); + float32x2_t v675 = vadd_f32(v667, v669); + float32x2_t v676 = vsub_f32(v667, v669); + float32x2_t v677 = vadd_f32(v671, v673); + float32x2_t v678 = vsub_f32(v671, v673); + float32x2_t v681 = vadd_f32(v672, v674); + float32x2_t v682 = vsub_f32(v672, v674); + float32x2_t v713 = vrev64_f32(v668); + float32x2_t v718 = vmul_f32(v670, v717); + float32x2_t v320 = vadd_f32(v53, v205); + float32x2_t v321 = vsub_f32(v53, v205); + float32x2_t v322 = vadd_f32(v129, v281); + float32x2_t v323 = vsub_f32(v129, v281); + float32x2_t v324 = vadd_f32(v91, v243); + float32x2_t v325 = vsub_f32(v91, v243); + float32x2_t v326 = vadd_f32(v167, v319); + float32x2_t v327 = vsub_f32(v167, v319); + float32x2_t v395 = vadd_f32(v387, v389); + float32x2_t v396 = vsub_f32(v387, v389); + float32x2_t v397 = vadd_f32(v391, v393); + float32x2_t v398 = vsub_f32(v391, v393); + float32x2_t v401 = vadd_f32(v392, v394); + float32x2_t v402 = vsub_f32(v392, v394); + float32x2_t v425 = vmul_f32(v388, v424); + float32x2_t v431 = vrev64_f32(v390); + float32x2_t v462 = vadd_f32(v454, v456); + float32x2_t v463 = vsub_f32(v454, v456); + float32x2_t v464 = vadd_f32(v458, v460); + float32x2_t v465 = vsub_f32(v458, v460); + float32x2_t v468 = vadd_f32(v459, v461); + float32x2_t v469 = vsub_f32(v459, v461); + float32x2_t v492 = vmul_f32(v455, v491); + float32x2_t v498 = vrev64_f32(v457); + 
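+    // Editorial note (assumption, not part of the generated kernel): the
+    // vrev64_f32 / vmul_f32 pairs in this block implement multiplication by a
+    // purely imaginary twiddle factor: vrev64_f32 swaps the (re, im) halves,
+    // and the following multiply by a {+c, -c} (or {-c, +c}) constant vector
+    // gives -j*c (or +j*c) times the input, with the transform direction
+    // folded in through the v726 = {dir, dir} factor.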
float32x2_t v533 = vadd_f32(v529, v531); + float32x2_t v534 = vsub_f32(v529, v531); + float32x2_t v556 = vrev64_f32(v530); + float32x2_t v561 = vmul_f32(v532, v571); + float32x2_t v568 = vmul_f32(v567, v566); + float32x2_t v576 = vmul_f32(v535, v575); + float32x2_t v582 = vrev64_f32(v536); + float32x2_t v602 = vadd_f32(v594, v596); + float32x2_t v603 = vsub_f32(v594, v596); + float32x2_t v604 = vadd_f32(v598, v600); + float32x2_t v605 = vsub_f32(v598, v600); + float32x2_t v608 = vadd_f32(v599, v601); + float32x2_t v609 = vsub_f32(v599, v601); + float32x2_t v640 = vrev64_f32(v595); + float32x2_t v645 = vmul_f32(v597, v644); + float32x2_t v679 = vadd_f32(v675, v677); + float32x2_t v680 = vsub_f32(v675, v677); + float32x2_t v702 = vrev64_f32(v676); + float32x2_t v707 = vmul_f32(v678, v717); + float32x2_t v714 = vmul_f32(v713, v712); + float32x2_t v722 = vmul_f32(v681, v721); + float32x2_t v728 = vrev64_f32(v682); + float32x2_t v328 = vadd_f32(v320, v322); + float32x2_t v329 = vsub_f32(v320, v322); + float32x2_t v330 = vadd_f32(v324, v326); + float32x2_t v331 = vsub_f32(v324, v326); + float32x2_t v334 = vadd_f32(v325, v327); + float32x2_t v335 = vsub_f32(v325, v327); + float32x2_t v364 = vrev64_f32(v323); + float32x2_t v399 = vadd_f32(v395, v397); + float32x2_t v400 = vsub_f32(v395, v397); + float32x2_t v414 = vmul_f32(v396, v424); + float32x2_t v420 = vrev64_f32(v398); + float32x2_t v432 = vmul_f32(v431, v430); + float32x2_t v438 = vrev64_f32(v401); + float32x2_t v443 = vmul_f32(v402, v442); + float32x2_t v466 = vadd_f32(v462, v464); + float32x2_t v467 = vsub_f32(v462, v464); + float32x2_t v481 = vmul_f32(v463, v491); + float32x2_t v487 = vrev64_f32(v465); + float32x2_t v499 = vmul_f32(v498, v497); + float32x2_t v505 = vrev64_f32(v468); + float32x2_t v510 = vmul_f32(v469, v509); + float32x2_t v542 = vrev64_f32(v533); + float32x2_t v549 = vrev64_f32(v534); + float32x2_t v557 = vmul_f32(v556, v566); + float32x2_t v583 = vmul_f32(v582, v581); + float32x2_t v588 = vadd_f32(v572, v576); + float32x2_t v589 = vsub_f32(v572, v576); + float32x2_t v606 = vadd_f32(v602, v604); + float32x2_t v607 = vsub_f32(v602, v604); + float32x2_t v629 = vrev64_f32(v603); + float32x2_t v634 = vmul_f32(v605, v644); + float32x2_t v641 = vmul_f32(v640, v639); + float32x2_t v649 = vmul_f32(v608, v648); + float32x2_t v655 = vrev64_f32(v609); + float32x2_t v688 = vrev64_f32(v679); + float32x2_t v695 = vrev64_f32(v680); + float32x2_t v703 = vmul_f32(v702, v712); + float32x2_t v729 = vmul_f32(v728, v727); + float32x2_t v734 = vadd_f32(v718, v722); + float32x2_t v735 = vsub_f32(v718, v722); + float32x2_t v332 = vadd_f32(v328, v330); + float32x2_t v333 = vsub_f32(v328, v330); + float32x2_t v353 = vrev64_f32(v331); + float32x2_t v365 = vmul_f32(v364, v363); + float32x2_t v371 = vrev64_f32(v334); + float32x2_t v376 = vmul_f32(v335, v375); + float32x2_t v406 = vmul_f32(v399, v424); + float32x2_t v410 = vmul_f32(v400, v424); + float32x2_t v421 = vmul_f32(v420, v430); + float32x2_t v439 = vmul_f32(v438, v437); + float32x2_t v446 = vadd_f32(v425, v443); + float32x2_t v447 = vsub_f32(v425, v443); + float32x2_t v473 = vmul_f32(v466, v491); + float32x2_t v477 = vmul_f32(v467, v491); + float32x2_t v488 = vmul_f32(v487, v497); + float32x2_t v506 = vmul_f32(v505, v504); + float32x2_t v513 = vadd_f32(v492, v510); + float32x2_t v514 = vsub_f32(v492, v510); + float32x2_t v543 = vmul_f32(v542, v566); + float32x2_t v550 = vmul_f32(v549, v566); + float32x2_t v584 = vadd_f32(v557, v561); + float32x2_t v585 = vsub_f32(v557, v561); + float32x2_t 
v586 = vadd_f32(v568, v583); + float32x2_t v587 = vsub_f32(v568, v583); + float32x2_t v615 = vrev64_f32(v606); + float32x2_t v622 = vrev64_f32(v607); + float32x2_t v630 = vmul_f32(v629, v639); + float32x2_t v656 = vmul_f32(v655, v654); + float32x2_t v661 = vadd_f32(v645, v649); + float32x2_t v662 = vsub_f32(v645, v649); + float32x2_t v689 = vmul_f32(v688, v712); + float32x2_t v696 = vmul_f32(v695, v712); + float32x2_t v730 = vadd_f32(v703, v707); + float32x2_t v731 = vsub_f32(v703, v707); + float32x2_t v732 = vadd_f32(v714, v729); + float32x2_t v733 = vsub_f32(v714, v729); + float32x2_t v354 = vmul_f32(v353, v363); + float32x2_t v372 = vmul_f32(v371, v370); + float32x2_t v379 = vadd_f32(v321, v376); + float32x2_t v380 = vsub_f32(v321, v376); + float32x2_t v444 = vadd_f32(v414, v421); + float32x2_t v445 = vsub_f32(v414, v421); + float32x2_t v448 = vadd_f32(v432, v439); + float32x2_t v449 = vsub_f32(v432, v439); + float32x2_t v511 = vadd_f32(v481, v488); + float32x2_t v512 = vsub_f32(v481, v488); + float32x2_t v515 = vadd_f32(v499, v506); + float32x2_t v516 = vsub_f32(v499, v506); + float32x2_t v590 = vadd_f32(v586, v588); + float32x2_t v591 = vsub_f32(v586, v588); + float32x2_t v592 = vadd_f32(v587, v589); + float32x2_t v593 = vsub_f32(v587, v589); + float32x2_t v616 = vmul_f32(v615, v639); + float32x2_t v623 = vmul_f32(v622, v639); + float32x2_t v657 = vadd_f32(v630, v634); + float32x2_t v658 = vsub_f32(v630, v634); + float32x2_t v659 = vadd_f32(v641, v656); + float32x2_t v660 = vsub_f32(v641, v656); + float32x2_t v736 = vadd_f32(v732, v734); + float32x2_t v737 = vsub_f32(v732, v734); + float32x2_t v738 = vadd_f32(v733, v735); + float32x2_t v739 = vsub_f32(v733, v735); + float32x2_t v740 = vadd_f32(v332, v406); + v6[0] = v332; + float32x2_t v876 = vadd_f32(v333, v410); + v6[ostride * 20] = v333; + float32x2_t v377 = vadd_f32(v329, v354); + float32x2_t v378 = vsub_f32(v329, v354); + float32x2_t v381 = vadd_f32(v365, v372); + float32x2_t v382 = vsub_f32(v365, v372); + float32x2_t v450 = vadd_f32(v446, v448); + float32x2_t v451 = vsub_f32(v446, v448); + float32x2_t v452 = vadd_f32(v447, v449); + float32x2_t v453 = vsub_f32(v447, v449); + float32x2_t v517 = vadd_f32(v513, v515); + float32x2_t v518 = vsub_f32(v513, v515); + float32x2_t v519 = vadd_f32(v514, v516); + float32x2_t v520 = vsub_f32(v514, v516); + float32x2_t v663 = vadd_f32(v659, v661); + float32x2_t v664 = vsub_f32(v659, v661); + float32x2_t v665 = vadd_f32(v660, v662); + float32x2_t v666 = vsub_f32(v660, v662); + float32x2_t v741 = vadd_f32(v740, v473); + float32x2_t v742 = vsub_f32(v740, v473); + float32x2_t v743 = vsub_f32(v543, v616); + float32x2_t v744 = vadd_f32(v616, v689); + float32x2_t v811 = vsub_f32(v585, v658); + float32x2_t v812 = vadd_f32(v658, v731); + float32x2_t v877 = vadd_f32(v876, v477); + float32x2_t v878 = vsub_f32(v876, v477); + float32x2_t v879 = vsub_f32(v550, v623); + float32x2_t v880 = vadd_f32(v623, v696); + float32x2_t v947 = vsub_f32(v584, v657); + float32x2_t v948 = vadd_f32(v657, v730); + float32x2_t v383 = vadd_f32(v379, v381); + float32x2_t v384 = vsub_f32(v379, v381); + float32x2_t v385 = vadd_f32(v380, v382); + float32x2_t v386 = vsub_f32(v380, v382); + float32x2_t v745 = vadd_f32(v741, v743); + float32x2_t v746 = vsub_f32(v741, v743); + float32x2_t v747 = vadd_f32(v742, v744); + float32x2_t v748 = vsub_f32(v742, v744); + float32x2_t v777 = vsub_f32(v591, v664); + float32x2_t v778 = vadd_f32(v664, v737); + float32x2_t v808 = vadd_f32(v378, v445); + v6[ostride * 10] = v378; + float32x2_t v845 = 
vsub_f32(v592, v665); + float32x2_t v846 = vadd_f32(v665, v738); + float32x2_t v881 = vadd_f32(v877, v879); + float32x2_t v882 = vsub_f32(v877, v879); + float32x2_t v883 = vadd_f32(v878, v880); + float32x2_t v884 = vsub_f32(v878, v880); + float32x2_t v913 = vsub_f32(v593, v666); + float32x2_t v914 = vadd_f32(v666, v739); + float32x2_t v944 = vadd_f32(v377, v444); + v6[ostride * 30] = v377; + float32x2_t v981 = vsub_f32(v590, v663); + float32x2_t v982 = vadd_f32(v663, v736); + v6[ostride * 16] = v746; + v6[ostride * 32] = v748; + v6[ostride * 8] = v747; + v6[ostride * 24] = v745; + float32x2_t v774 = vadd_f32(v384, v451); + v6[ostride * 25] = v384; + float32x2_t v809 = vadd_f32(v808, v512); + float32x2_t v810 = vsub_f32(v808, v512); + float32x2_t v842 = vadd_f32(v385, v452); + v6[ostride * 35] = v385; + v6[ostride * 36] = v882; + v6[ostride * 12] = v884; + v6[ostride * 28] = v883; + v6[ostride * 4] = v881; + float32x2_t v910 = vadd_f32(v386, v453); + v6[ostride * 5] = v386; + float32x2_t v945 = vadd_f32(v944, v511); + float32x2_t v946 = vsub_f32(v944, v511); + float32x2_t v978 = vadd_f32(v383, v450); + v6[ostride * 15] = v383; + float32x2_t v775 = vadd_f32(v774, v518); + float32x2_t v776 = vsub_f32(v774, v518); + float32x2_t v813 = vadd_f32(v809, v811); + float32x2_t v814 = vsub_f32(v809, v811); + float32x2_t v815 = vadd_f32(v810, v812); + float32x2_t v816 = vsub_f32(v810, v812); + float32x2_t v843 = vadd_f32(v842, v519); + float32x2_t v844 = vsub_f32(v842, v519); + float32x2_t v911 = vadd_f32(v910, v520); + float32x2_t v912 = vsub_f32(v910, v520); + float32x2_t v949 = vadd_f32(v945, v947); + float32x2_t v950 = vsub_f32(v945, v947); + float32x2_t v951 = vadd_f32(v946, v948); + float32x2_t v952 = vsub_f32(v946, v948); + float32x2_t v979 = vadd_f32(v978, v517); + float32x2_t v980 = vsub_f32(v978, v517); + float32x2_t v779 = vadd_f32(v775, v777); + float32x2_t v780 = vsub_f32(v775, v777); + float32x2_t v781 = vadd_f32(v776, v778); + float32x2_t v782 = vsub_f32(v776, v778); + v6[ostride * 26] = v814; + v6[ostride * 2] = v816; + v6[ostride * 18] = v815; + v6[ostride * 34] = v813; + float32x2_t v847 = vadd_f32(v843, v845); + float32x2_t v848 = vsub_f32(v843, v845); + float32x2_t v849 = vadd_f32(v844, v846); + float32x2_t v850 = vsub_f32(v844, v846); + float32x2_t v915 = vadd_f32(v911, v913); + float32x2_t v916 = vsub_f32(v911, v913); + float32x2_t v917 = vadd_f32(v912, v914); + float32x2_t v918 = vsub_f32(v912, v914); + v6[ostride * 6] = v950; + v6[ostride * 22] = v952; + v6[ostride * 38] = v951; + v6[ostride * 14] = v949; + float32x2_t v983 = vadd_f32(v979, v981); + float32x2_t v984 = vsub_f32(v979, v981); + float32x2_t v985 = vadd_f32(v980, v982); + float32x2_t v986 = vsub_f32(v980, v982); + v6[ostride] = v780; + v6[ostride * 17] = v782; + v6[ostride * 33] = v781; + v6[ostride * 9] = v779; + v6[ostride * 11] = v848; + v6[ostride * 27] = v850; + v6[ostride * 3] = v849; + v6[ostride * 19] = v847; + v6[ostride * 21] = v916; + v6[ostride * 37] = v918; + v6[ostride * 13] = v917; + v6[ostride * 29] = v915; + v6[ostride * 31] = v984; + v6[ostride * 7] = v986; + v6[ostride * 23] = v985; + v6[ostride * 39] = v983; + v5 += 1 * idist; + v6 += 1 * 1; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cf32_ac_n_gu40(const armral_cmplx_int16_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + int idist, float dir) { + int64_t v0 = istride; + int64_t v1 = idist; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + 
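+  // Editorial note (assumption, not part of the generated kernel): x holds
+  // interleaved Q15 complex samples (armral_cmplx_int16_t), so the cast to
+  // const int32_t * lets each gather below fetch one (re, im) pair per 64-bit
+  // lane. The svtbl_s16 permutation uses out-of-range indices to zero the low
+  // half-words, leaving each int16 in the top half of a 32-bit element
+  // (value << 16); converting to float and scaling by 1.F / (1ULL << 31)
+  // then divides by 2^15, i.e. it maps Q15 samples to float in [-1, 1).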
float32x2_t *v6 = (float32x2_t *)y; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * v1; + int64_t v12 = v10 * 1; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + float v446 = -1.0000000000000000e+00F; + float v453 = -7.0710678118654746e-01F; + float v460 = 7.0710678118654757e-01F; + float v513 = -1.2500000000000000e+00F; + float v518 = 1.2500000000000000e+00F; + float v525 = 8.8388347648318433e-01F; + float v532 = -8.8388347648318444e-01F; + float v585 = 5.5901699437494745e-01F; + float v590 = -5.5901699437494745e-01F; + float v597 = -3.9528470752104738e-01F; + float v604 = 3.9528470752104744e-01F; + float v668 = -1.5388417685876268e+00F; + float v678 = -1.0881254497414108e+00F; + float v744 = -5.8778525229247325e-01F; + float v754 = -4.1562693777745352e-01F; + float v820 = -3.6327126400268028e-01F; + float v830 = -2.5687157418650380e-01F; + const int32_t *v1457 = &v5[v0]; + float32x2_t *v1668 = &v6[v2]; + int64_t v19 = v0 * 8; + int64_t v27 = v0 * 32; + int64_t v37 = v0 * 24; + int64_t v45 = v0 * 16; + int64_t v67 = v0 * 13; + int64_t v75 = v0 * 37; + int64_t v85 = v0 * 29; + int64_t v93 = v0 * 21; + int64_t v106 = v0 * 5; + int64_t v115 = v0 * 18; + int64_t v123 = v0 * 2; + int64_t v133 = v0 * 34; + int64_t v141 = v0 * 26; + int64_t v154 = v0 * 10; + int64_t v163 = v0 * 23; + int64_t v171 = v0 * 7; + int64_t v181 = v0 * 39; + int64_t v189 = v0 * 31; + int64_t v202 = v0 * 15; + int64_t v211 = v0 * 28; + int64_t v219 = v0 * 12; + int64_t v229 = v0 * 4; + int64_t v237 = v0 * 36; + int64_t v250 = v0 * 20; + int64_t v259 = v0 * 33; + int64_t v267 = v0 * 17; + int64_t v277 = v0 * 9; + int64_t v298 = v0 * 25; + int64_t v307 = v0 * 38; + int64_t v315 = v0 * 22; + int64_t v325 = v0 * 14; + int64_t v333 = v0 * 6; + int64_t v346 = v0 * 30; + int64_t v355 = v0 * 3; + int64_t v363 = v0 * 27; + int64_t v373 = v0 * 19; + int64_t v381 = v0 * 11; + int64_t v394 = v0 * 35; + float v449 = v4 * v446; + float v456 = v4 * v453; + float v521 = v4 * v518; + float v528 = v4 * v525; + float v593 = v4 * v590; + float v600 = v4 * v597; + float v664 = v4 * v668; + float v681 = v4 * v678; + float v740 = v4 * v744; + float v757 = v4 * v754; + float v816 = v4 * v820; + float v833 = v4 * v830; + int64_t v863 = v2 * 16; + int64_t v870 = v2 * 32; + int64_t v877 = v2 * 8; + int64_t v884 = v2 * 24; + int64_t v900 = v2 * 25; + int64_t v914 = v2 * 17; + int64_t v921 = v2 * 33; + int64_t v928 = v2 * 9; + int64_t v944 = v2 * 10; + int64_t v951 = v2 * 26; + int64_t v958 = v2 * 2; + int64_t v965 = v2 * 18; + int64_t v972 = v2 * 34; + int64_t v988 = v2 * 35; + int64_t v995 = v2 * 11; + int64_t v1002 = v2 * 27; + int64_t v1009 = v2 * 3; + int64_t v1016 = v2 * 19; + int64_t v1032 = v2 * 20; + int64_t v1039 = v2 * 36; + int64_t v1046 = v2 * 12; + int64_t v1053 = v2 * 28; + int64_t v1060 = v2 * 4; + int64_t v1076 = v2 * 5; + int64_t v1083 = v2 * 21; + int64_t v1090 = v2 * 37; + int64_t v1097 = v2 * 13; + int64_t v1104 = v2 * 29; + int64_t v1120 = v2 * 30; + int64_t v1127 = v2 * 6; + int64_t v1134 = v2 * 22; + int64_t v1141 = v2 * 38; + int64_t v1148 = v2 * 14; + int64_t v1164 = v2 * 15; + int64_t v1171 = v2 * 31; + int64_t v1178 = v2 * 7; + int64_t v1185 = v2 * 23; + int64_t v1192 = v2 * 39; + const int32_t *v1241 = &v5[0]; + svint64_t v1557 = svindex_s64(0, v1); + svfloat32_t v1566 = svdup_n_f32(v460); + svfloat32_t v1571 = svdup_n_f32(v513); + svfloat32_t v1574 = svdup_n_f32(v532); + svfloat32_t v1579 = svdup_n_f32(v585); + svfloat32_t v1582 = 
svdup_n_f32(v604); + svfloat32_t v1588 = svdup_n_f32(v668); + svfloat32_t v1589 = svdup_n_f32(v678); + svfloat32_t v1596 = svdup_n_f32(v744); + svfloat32_t v1597 = svdup_n_f32(v754); + svfloat32_t v1604 = svdup_n_f32(v820); + svfloat32_t v1605 = svdup_n_f32(v830); + float32x2_t *v1614 = &v6[0]; + const int32_t *v1204 = &v5[v19]; + const int32_t *v1213 = &v5[v27]; + const int32_t *v1222 = &v5[v37]; + const int32_t *v1231 = &v5[v45]; + svint16_t v1243 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1241), v1557)); + const int32_t *v1250 = &v5[v67]; + const int32_t *v1259 = &v5[v75]; + const int32_t *v1268 = &v5[v85]; + const int32_t *v1277 = &v5[v93]; + const int32_t *v1286 = &v5[v106]; + const int32_t *v1295 = &v5[v115]; + const int32_t *v1304 = &v5[v123]; + const int32_t *v1313 = &v5[v133]; + const int32_t *v1322 = &v5[v141]; + const int32_t *v1331 = &v5[v154]; + const int32_t *v1340 = &v5[v163]; + const int32_t *v1349 = &v5[v171]; + const int32_t *v1358 = &v5[v181]; + const int32_t *v1367 = &v5[v189]; + const int32_t *v1376 = &v5[v202]; + const int32_t *v1385 = &v5[v211]; + const int32_t *v1394 = &v5[v219]; + const int32_t *v1403 = &v5[v229]; + const int32_t *v1412 = &v5[v237]; + const int32_t *v1421 = &v5[v250]; + const int32_t *v1430 = &v5[v259]; + const int32_t *v1439 = &v5[v267]; + const int32_t *v1448 = &v5[v277]; + svint16_t v1459 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1457), v1557)); + const int32_t *v1466 = &v5[v298]; + const int32_t *v1475 = &v5[v307]; + const int32_t *v1484 = &v5[v315]; + const int32_t *v1493 = &v5[v325]; + const int32_t *v1502 = &v5[v333]; + const int32_t *v1511 = &v5[v346]; + const int32_t *v1520 = &v5[v355]; + const int32_t *v1529 = &v5[v363]; + const int32_t *v1538 = &v5[v373]; + const int32_t *v1547 = &v5[v381]; + const int32_t *v1556 = &v5[v394]; + svfloat32_t v1564 = svdup_n_f32(v449); + svfloat32_t v1565 = svdup_n_f32(v456); + svfloat32_t v1572 = svdup_n_f32(v521); + svfloat32_t v1573 = svdup_n_f32(v528); + svfloat32_t v1580 = svdup_n_f32(v593); + svfloat32_t v1581 = svdup_n_f32(v600); + svfloat32_t v1587 = svdup_n_f32(v664); + svfloat32_t v1590 = svdup_n_f32(v681); + svfloat32_t v1595 = svdup_n_f32(v740); + svfloat32_t v1598 = svdup_n_f32(v757); + svfloat32_t v1603 = svdup_n_f32(v816); + svfloat32_t v1606 = svdup_n_f32(v833); + float32x2_t *v1623 = &v6[v863]; + float32x2_t *v1632 = &v6[v870]; + float32x2_t *v1641 = &v6[v877]; + float32x2_t *v1650 = &v6[v884]; + float32x2_t *v1659 = &v6[v900]; + float32x2_t *v1677 = &v6[v914]; + float32x2_t *v1686 = &v6[v921]; + float32x2_t *v1695 = &v6[v928]; + float32x2_t *v1704 = &v6[v944]; + float32x2_t *v1713 = &v6[v951]; + float32x2_t *v1722 = &v6[v958]; + float32x2_t *v1731 = &v6[v965]; + float32x2_t *v1740 = &v6[v972]; + float32x2_t *v1749 = &v6[v988]; + float32x2_t *v1758 = &v6[v995]; + float32x2_t *v1767 = &v6[v1002]; + float32x2_t *v1776 = &v6[v1009]; + float32x2_t *v1785 = &v6[v1016]; + float32x2_t *v1794 = &v6[v1032]; + float32x2_t *v1803 = &v6[v1039]; + float32x2_t *v1812 = &v6[v1046]; + float32x2_t *v1821 = &v6[v1053]; + float32x2_t *v1830 = &v6[v1060]; + float32x2_t *v1839 = &v6[v1076]; + float32x2_t *v1848 = &v6[v1083]; + float32x2_t *v1857 = &v6[v1090]; + float32x2_t *v1866 = &v6[v1097]; + float32x2_t *v1875 = &v6[v1104]; + float32x2_t *v1884 = &v6[v1120]; + float32x2_t *v1893 = &v6[v1127]; + float32x2_t *v1902 = &v6[v1134]; + float32x2_t *v1911 = &v6[v1141]; + float32x2_t *v1920 = &v6[v1148]; + float32x2_t *v1929 = 
&v6[v1164]; + float32x2_t *v1938 = &v6[v1171]; + float32x2_t *v1947 = &v6[v1178]; + float32x2_t *v1956 = &v6[v1185]; + float32x2_t *v1965 = &v6[v1192]; + svfloat32_t v64 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1243, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v291 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1459, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svint16_t v1206 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1204), v1557)); + svint16_t v1215 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1213), v1557)); + svint16_t v1224 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1222), v1557)); + svint16_t v1233 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1231), v1557)); + svint16_t v1252 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1250), v1557)); + svint16_t v1261 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1259), v1557)); + svint16_t v1270 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1268), v1557)); + svint16_t v1279 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1277), v1557)); + svint16_t v1288 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1286), v1557)); + svint16_t v1297 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1295), v1557)); + svint16_t v1306 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1304), v1557)); + svint16_t v1315 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1313), v1557)); + svint16_t v1324 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1322), v1557)); + svint16_t v1333 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1331), v1557)); + svint16_t v1342 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1340), v1557)); + svint16_t v1351 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1349), v1557)); + svint16_t v1360 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1358), v1557)); + svint16_t v1369 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1367), v1557)); + svint16_t v1378 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1376), v1557)); + svint16_t v1387 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1385), v1557)); + svint16_t v1396 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1394), v1557)); + svint16_t v1405 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1403), v1557)); + svint16_t v1414 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1412), v1557)); + svint16_t v1423 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1421), v1557)); + svint16_t v1432 = 
svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1430), v1557)); + svint16_t v1441 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1439), v1557)); + svint16_t v1450 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1448), v1557)); + svint16_t v1468 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1466), v1557)); + svint16_t v1477 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1475), v1557)); + svint16_t v1486 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1484), v1557)); + svint16_t v1495 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1493), v1557)); + svint16_t v1504 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1502), v1557)); + svint16_t v1513 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1511), v1557)); + svint16_t v1522 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1520), v1557)); + svint16_t v1531 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1529), v1557)); + svint16_t v1540 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1538), v1557)); + svint16_t v1549 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1547), v1557)); + svint16_t v1558 = svreinterpret_s16_u64(svld1uw_gather_s64index_u64( + pred_full, (const unsigned *)(v1556), v1557)); + svfloat32_t v25 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1206, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v33 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1215, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v43 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1224, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v51 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1233, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v73 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1252, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v81 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1261, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v91 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1270, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v99 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1279, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t 
v112 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1288, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v121 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1297, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v129 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1306, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v139 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1315, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v147 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1324, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v160 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1333, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v169 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1342, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v177 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1351, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v187 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1360, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v195 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1369, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v208 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1378, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v217 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1387, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v225 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1396, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v235 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1405, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v243 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1414, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v256 = 
svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1423, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v265 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1432, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v273 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1441, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v283 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1450, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v304 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1468, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v313 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1477, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v321 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1486, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v331 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1495, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v339 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1504, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v352 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1513, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v361 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1522, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v369 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1531, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v379 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1540, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v387 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1549, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v400 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, svreinterpret_s32_s16(svtbl_s16( + v1558, svreinterpret_u16_u64(svindex_u64( + 0x0001ffff0000ffffULL, + 0x0004000000040000ULL))))), + 1.F / (1ULL << 31ULL)); + svfloat32_t v34 = 
svadd_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v35 = svsub_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v52 = svadd_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v53 = svsub_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v82 = svadd_f32_x(svptrue_b32(), v73, v81); + svfloat32_t v83 = svsub_f32_x(svptrue_b32(), v73, v81); + svfloat32_t v100 = svadd_f32_x(svptrue_b32(), v91, v99); + svfloat32_t v101 = svsub_f32_x(svptrue_b32(), v91, v99); + svfloat32_t v130 = svadd_f32_x(svptrue_b32(), v121, v129); + svfloat32_t v131 = svsub_f32_x(svptrue_b32(), v121, v129); + svfloat32_t v148 = svadd_f32_x(svptrue_b32(), v139, v147); + svfloat32_t v149 = svsub_f32_x(svptrue_b32(), v139, v147); + svfloat32_t v178 = svadd_f32_x(svptrue_b32(), v169, v177); + svfloat32_t v179 = svsub_f32_x(svptrue_b32(), v169, v177); + svfloat32_t v196 = svadd_f32_x(svptrue_b32(), v187, v195); + svfloat32_t v197 = svsub_f32_x(svptrue_b32(), v187, v195); + svfloat32_t v226 = svadd_f32_x(svptrue_b32(), v217, v225); + svfloat32_t v227 = svsub_f32_x(svptrue_b32(), v217, v225); + svfloat32_t v244 = svadd_f32_x(svptrue_b32(), v235, v243); + svfloat32_t v245 = svsub_f32_x(svptrue_b32(), v235, v243); + svfloat32_t v274 = svadd_f32_x(svptrue_b32(), v265, v273); + svfloat32_t v275 = svsub_f32_x(svptrue_b32(), v265, v273); + svfloat32_t v292 = svadd_f32_x(svptrue_b32(), v283, v291); + svfloat32_t v293 = svsub_f32_x(svptrue_b32(), v283, v291); + svfloat32_t v322 = svadd_f32_x(svptrue_b32(), v313, v321); + svfloat32_t v323 = svsub_f32_x(svptrue_b32(), v313, v321); + svfloat32_t v340 = svadd_f32_x(svptrue_b32(), v331, v339); + svfloat32_t v341 = svsub_f32_x(svptrue_b32(), v331, v339); + svfloat32_t v370 = svadd_f32_x(svptrue_b32(), v361, v369); + svfloat32_t v371 = svsub_f32_x(svptrue_b32(), v361, v369); + svfloat32_t v388 = svadd_f32_x(svptrue_b32(), v379, v387); + svfloat32_t v389 = svsub_f32_x(svptrue_b32(), v379, v387); + svfloat32_t v54 = svadd_f32_x(svptrue_b32(), v34, v52); + svfloat32_t v55 = svsub_f32_x(svptrue_b32(), v34, v52); + svfloat32_t v56 = svadd_f32_x(svptrue_b32(), v35, v53); + svfloat32_t v102 = svadd_f32_x(svptrue_b32(), v82, v100); + svfloat32_t v103 = svsub_f32_x(svptrue_b32(), v82, v100); + svfloat32_t v104 = svadd_f32_x(svptrue_b32(), v83, v101); + svfloat32_t v150 = svadd_f32_x(svptrue_b32(), v130, v148); + svfloat32_t v151 = svsub_f32_x(svptrue_b32(), v130, v148); + svfloat32_t v152 = svadd_f32_x(svptrue_b32(), v131, v149); + svfloat32_t v198 = svadd_f32_x(svptrue_b32(), v178, v196); + svfloat32_t v199 = svsub_f32_x(svptrue_b32(), v178, v196); + svfloat32_t v200 = svadd_f32_x(svptrue_b32(), v179, v197); + svfloat32_t v246 = svadd_f32_x(svptrue_b32(), v226, v244); + svfloat32_t v247 = svsub_f32_x(svptrue_b32(), v226, v244); + svfloat32_t v248 = svadd_f32_x(svptrue_b32(), v227, v245); + svfloat32_t v294 = svadd_f32_x(svptrue_b32(), v274, v292); + svfloat32_t v295 = svsub_f32_x(svptrue_b32(), v274, v292); + svfloat32_t v296 = svadd_f32_x(svptrue_b32(), v275, v293); + svfloat32_t v342 = svadd_f32_x(svptrue_b32(), v322, v340); + svfloat32_t v343 = svsub_f32_x(svptrue_b32(), v322, v340); + svfloat32_t v344 = svadd_f32_x(svptrue_b32(), v323, v341); + svfloat32_t v390 = svadd_f32_x(svptrue_b32(), v370, v388); + svfloat32_t v391 = svsub_f32_x(svptrue_b32(), v370, v388); + svfloat32_t v392 = svadd_f32_x(svptrue_b32(), v371, v389); + svfloat32_t v618 = svadd_f32_x(svptrue_b32(), v35, v227); + svfloat32_t v619 = svsub_f32_x(svptrue_b32(), v35, v227); + svfloat32_t v620 = svadd_f32_x(svptrue_b32(), v131, v323); + 
svfloat32_t v621 = svsub_f32_x(svptrue_b32(), v131, v323); + svfloat32_t v622 = svadd_f32_x(svptrue_b32(), v83, v275); + svfloat32_t v623 = svsub_f32_x(svptrue_b32(), v83, v275); + svfloat32_t v624 = svadd_f32_x(svptrue_b32(), v179, v371); + svfloat32_t v625 = svsub_f32_x(svptrue_b32(), v179, v371); + svfloat32_t v770 = svadd_f32_x(svptrue_b32(), v53, v245); + svfloat32_t v771 = svsub_f32_x(svptrue_b32(), v53, v245); + svfloat32_t v772 = svadd_f32_x(svptrue_b32(), v149, v341); + svfloat32_t v773 = svsub_f32_x(svptrue_b32(), v149, v341); + svfloat32_t v774 = svadd_f32_x(svptrue_b32(), v101, v293); + svfloat32_t v775 = svsub_f32_x(svptrue_b32(), v101, v293); + svfloat32_t v776 = svadd_f32_x(svptrue_b32(), v197, v389); + svfloat32_t v777 = svsub_f32_x(svptrue_b32(), v197, v389); + svfloat32_t v65 = svadd_f32_x(svptrue_b32(), v54, v64); + svfloat32_t v113 = svadd_f32_x(svptrue_b32(), v102, v112); + svfloat32_t v161 = svadd_f32_x(svptrue_b32(), v150, v160); + svfloat32_t v209 = svadd_f32_x(svptrue_b32(), v198, v208); + svfloat32_t v257 = svadd_f32_x(svptrue_b32(), v246, v256); + svfloat32_t v305 = svadd_f32_x(svptrue_b32(), v294, v304); + svfloat32_t v353 = svadd_f32_x(svptrue_b32(), v342, v352); + svfloat32_t v401 = svadd_f32_x(svptrue_b32(), v390, v400); + svfloat32_t v474 = svadd_f32_x(svptrue_b32(), v54, v246); + svfloat32_t v475 = svsub_f32_x(svptrue_b32(), v54, v246); + svfloat32_t v476 = svadd_f32_x(svptrue_b32(), v150, v342); + svfloat32_t v477 = svsub_f32_x(svptrue_b32(), v150, v342); + svfloat32_t v478 = svadd_f32_x(svptrue_b32(), v102, v294); + svfloat32_t v479 = svsub_f32_x(svptrue_b32(), v102, v294); + svfloat32_t v480 = svadd_f32_x(svptrue_b32(), v198, v390); + svfloat32_t v481 = svsub_f32_x(svptrue_b32(), v198, v390); + svfloat32_t v546 = svadd_f32_x(svptrue_b32(), v55, v247); + svfloat32_t v547 = svsub_f32_x(svptrue_b32(), v55, v247); + svfloat32_t v548 = svadd_f32_x(svptrue_b32(), v151, v343); + svfloat32_t v549 = svsub_f32_x(svptrue_b32(), v151, v343); + svfloat32_t v550 = svadd_f32_x(svptrue_b32(), v103, v295); + svfloat32_t v551 = svsub_f32_x(svptrue_b32(), v103, v295); + svfloat32_t v552 = svadd_f32_x(svptrue_b32(), v199, v391); + svfloat32_t v553 = svsub_f32_x(svptrue_b32(), v199, v391); + svfloat32_t v626 = svadd_f32_x(svptrue_b32(), v618, v620); + svfloat32_t v627 = svsub_f32_x(svptrue_b32(), v618, v620); + svfloat32_t v628 = svadd_f32_x(svptrue_b32(), v622, v624); + svfloat32_t v629 = svsub_f32_x(svptrue_b32(), v622, v624); + svfloat32_t v632 = svadd_f32_x(svptrue_b32(), v623, v625); + svfloat32_t v633 = svsub_f32_x(svptrue_b32(), v623, v625); + svfloat32_t zero666 = svdup_n_f32(0); + svfloat32_t v666 = svcmla_f32_x(pred_full, zero666, v1587, v619, 90); + svfloat32_t v694 = svadd_f32_x(svptrue_b32(), v56, v248); + svfloat32_t v695 = svsub_f32_x(svptrue_b32(), v56, v248); + svfloat32_t v696 = svadd_f32_x(svptrue_b32(), v152, v344); + svfloat32_t v697 = svsub_f32_x(svptrue_b32(), v152, v344); + svfloat32_t v698 = svadd_f32_x(svptrue_b32(), v104, v296); + svfloat32_t v699 = svsub_f32_x(svptrue_b32(), v104, v296); + svfloat32_t v700 = svadd_f32_x(svptrue_b32(), v200, v392); + svfloat32_t v701 = svsub_f32_x(svptrue_b32(), v200, v392); + svfloat32_t v778 = svadd_f32_x(svptrue_b32(), v770, v772); + svfloat32_t v779 = svsub_f32_x(svptrue_b32(), v770, v772); + svfloat32_t v780 = svadd_f32_x(svptrue_b32(), v774, v776); + svfloat32_t v781 = svsub_f32_x(svptrue_b32(), v774, v776); + svfloat32_t v784 = svadd_f32_x(svptrue_b32(), v775, v777); + svfloat32_t v785 = 
svsub_f32_x(svptrue_b32(), v775, v777); + svfloat32_t zero818 = svdup_n_f32(0); + svfloat32_t v818 = svcmla_f32_x(pred_full, zero818, v1603, v771, 90); + svfloat32_t v402 = svadd_f32_x(svptrue_b32(), v65, v257); + svfloat32_t v403 = svsub_f32_x(svptrue_b32(), v65, v257); + svfloat32_t v404 = svadd_f32_x(svptrue_b32(), v161, v353); + svfloat32_t v405 = svsub_f32_x(svptrue_b32(), v161, v353); + svfloat32_t v406 = svadd_f32_x(svptrue_b32(), v113, v305); + svfloat32_t v407 = svsub_f32_x(svptrue_b32(), v113, v305); + svfloat32_t v408 = svadd_f32_x(svptrue_b32(), v209, v401); + svfloat32_t v409 = svsub_f32_x(svptrue_b32(), v209, v401); + svfloat32_t v482 = svadd_f32_x(svptrue_b32(), v474, v476); + svfloat32_t v483 = svsub_f32_x(svptrue_b32(), v474, v476); + svfloat32_t v484 = svadd_f32_x(svptrue_b32(), v478, v480); + svfloat32_t v485 = svsub_f32_x(svptrue_b32(), v478, v480); + svfloat32_t v488 = svadd_f32_x(svptrue_b32(), v479, v481); + svfloat32_t v489 = svsub_f32_x(svptrue_b32(), v479, v481); + svfloat32_t zero523 = svdup_n_f32(0); + svfloat32_t v523 = svcmla_f32_x(pred_full, zero523, v1572, v477, 90); + svfloat32_t v554 = svadd_f32_x(svptrue_b32(), v546, v548); + svfloat32_t v555 = svsub_f32_x(svptrue_b32(), v546, v548); + svfloat32_t v556 = svadd_f32_x(svptrue_b32(), v550, v552); + svfloat32_t v557 = svsub_f32_x(svptrue_b32(), v550, v552); + svfloat32_t v560 = svadd_f32_x(svptrue_b32(), v551, v553); + svfloat32_t v561 = svsub_f32_x(svptrue_b32(), v551, v553); + svfloat32_t zero595 = svdup_n_f32(0); + svfloat32_t v595 = svcmla_f32_x(pred_full, zero595, v1580, v549, 90); + svfloat32_t v630 = svadd_f32_x(svptrue_b32(), v626, v628); + svfloat32_t v631 = svsub_f32_x(svptrue_b32(), v626, v628); + svfloat32_t zero654 = svdup_n_f32(0); + svfloat32_t v654 = svcmla_f32_x(pred_full, zero654, v1587, v627, 90); + svfloat32_t v676 = svmul_f32_x(svptrue_b32(), v632, v1589); + svfloat32_t zero683 = svdup_n_f32(0); + svfloat32_t v683 = svcmla_f32_x(pred_full, zero683, v1590, v633, 90); + svfloat32_t v702 = svadd_f32_x(svptrue_b32(), v694, v696); + svfloat32_t v703 = svsub_f32_x(svptrue_b32(), v694, v696); + svfloat32_t v704 = svadd_f32_x(svptrue_b32(), v698, v700); + svfloat32_t v705 = svsub_f32_x(svptrue_b32(), v698, v700); + svfloat32_t v708 = svadd_f32_x(svptrue_b32(), v699, v701); + svfloat32_t v709 = svsub_f32_x(svptrue_b32(), v699, v701); + svfloat32_t zero742 = svdup_n_f32(0); + svfloat32_t v742 = svcmla_f32_x(pred_full, zero742, v1595, v695, 90); + svfloat32_t v782 = svadd_f32_x(svptrue_b32(), v778, v780); + svfloat32_t v783 = svsub_f32_x(svptrue_b32(), v778, v780); + svfloat32_t zero806 = svdup_n_f32(0); + svfloat32_t v806 = svcmla_f32_x(pred_full, zero806, v1603, v779, 90); + svfloat32_t v828 = svmul_f32_x(svptrue_b32(), v784, v1605); + svfloat32_t zero835 = svdup_n_f32(0); + svfloat32_t v835 = svcmla_f32_x(pred_full, zero835, v1606, v785, 90); + svfloat32_t v410 = svadd_f32_x(svptrue_b32(), v402, v404); + svfloat32_t v411 = svsub_f32_x(svptrue_b32(), v402, v404); + svfloat32_t v412 = svadd_f32_x(svptrue_b32(), v406, v408); + svfloat32_t v413 = svsub_f32_x(svptrue_b32(), v406, v408); + svfloat32_t v416 = svadd_f32_x(svptrue_b32(), v407, v409); + svfloat32_t v417 = svsub_f32_x(svptrue_b32(), v407, v409); + svfloat32_t zero451 = svdup_n_f32(0); + svfloat32_t v451 = svcmla_f32_x(pred_full, zero451, v1564, v405, 90); + svfloat32_t v486 = svadd_f32_x(svptrue_b32(), v482, v484); + svfloat32_t v487 = svsub_f32_x(svptrue_b32(), v482, v484); + svfloat32_t zero511 = svdup_n_f32(0); + svfloat32_t v511 = 
svcmla_f32_x(pred_full, zero511, v1572, v485, 90); + svfloat32_t zero530 = svdup_n_f32(0); + svfloat32_t v530 = svcmla_f32_x(pred_full, zero530, v1573, v488, 90); + svfloat32_t v535 = svmul_f32_x(svptrue_b32(), v489, v1574); + svfloat32_t v558 = svadd_f32_x(svptrue_b32(), v554, v556); + svfloat32_t v559 = svsub_f32_x(svptrue_b32(), v554, v556); + svfloat32_t zero583 = svdup_n_f32(0); + svfloat32_t v583 = svcmla_f32_x(pred_full, zero583, v1580, v557, 90); + svfloat32_t zero602 = svdup_n_f32(0); + svfloat32_t v602 = svcmla_f32_x(pred_full, zero602, v1581, v560, 90); + svfloat32_t v607 = svmul_f32_x(svptrue_b32(), v561, v1582); + svfloat32_t zero640 = svdup_n_f32(0); + svfloat32_t v640 = svcmla_f32_x(pred_full, zero640, v1587, v630, 90); + svfloat32_t zero647 = svdup_n_f32(0); + svfloat32_t v647 = svcmla_f32_x(pred_full, zero647, v1587, v631, 90); + svfloat32_t v684 = svmla_f32_x(pred_full, v654, v629, v1588); + svfloat32_t v685 = svmls_f32_x(pred_full, v654, v629, v1588); + svfloat32_t v686 = svadd_f32_x(svptrue_b32(), v666, v683); + svfloat32_t v687 = svsub_f32_x(svptrue_b32(), v666, v683); + svfloat32_t v688 = svmla_f32_x(pred_full, v676, v621, v1588); + svfloat32_t v689 = svnmls_f32_x(pred_full, v676, v621, v1588); + svfloat32_t v706 = svadd_f32_x(svptrue_b32(), v702, v704); + svfloat32_t v707 = svsub_f32_x(svptrue_b32(), v702, v704); + svfloat32_t zero730 = svdup_n_f32(0); + svfloat32_t v730 = svcmla_f32_x(pred_full, zero730, v1595, v703, 90); + svfloat32_t v752 = svmul_f32_x(svptrue_b32(), v708, v1597); + svfloat32_t zero759 = svdup_n_f32(0); + svfloat32_t v759 = svcmla_f32_x(pred_full, zero759, v1598, v709, 90); + svfloat32_t v836 = svmla_f32_x(pred_full, v806, v781, v1604); + svfloat32_t v837 = svmls_f32_x(pred_full, v806, v781, v1604); + svfloat32_t v838 = svadd_f32_x(svptrue_b32(), v818, v835); + svfloat32_t v839 = svsub_f32_x(svptrue_b32(), v818, v835); + svfloat32_t v840 = svmla_f32_x(pred_full, v828, v773, v1604); + svfloat32_t v841 = svnmls_f32_x(pred_full, v828, v773, v1604); + svfloat32_t v414 = svadd_f32_x(svptrue_b32(), v410, v412); + svfloat32_t v415 = svsub_f32_x(svptrue_b32(), v410, v412); + svfloat32_t zero439 = svdup_n_f32(0); + svfloat32_t v439 = svcmla_f32_x(pred_full, zero439, v1564, v413, 90); + svfloat32_t zero458 = svdup_n_f32(0); + svfloat32_t v458 = svcmla_f32_x(pred_full, zero458, v1565, v416, 90); + svfloat32_t v536 = svmla_f32_x(pred_full, v511, v483, v1571); + svfloat32_t v537 = svnmls_f32_x(pred_full, v511, v483, v1571); + svfloat32_t v538 = svmla_f32_x(pred_full, v535, v475, v1571); + svfloat32_t v539 = svnmls_f32_x(pred_full, v535, v475, v1571); + svfloat32_t v540 = svadd_f32_x(svptrue_b32(), v523, v530); + svfloat32_t v541 = svsub_f32_x(svptrue_b32(), v523, v530); + svfloat32_t v608 = svmla_f32_x(pred_full, v583, v555, v1579); + svfloat32_t v609 = svnmls_f32_x(pred_full, v583, v555, v1579); + svfloat32_t v610 = svmla_f32_x(pred_full, v607, v547, v1579); + svfloat32_t v611 = svnmls_f32_x(pred_full, v607, v547, v1579); + svfloat32_t v612 = svadd_f32_x(svptrue_b32(), v595, v602); + svfloat32_t v613 = svsub_f32_x(svptrue_b32(), v595, v602); + svfloat32_t v690 = svadd_f32_x(svptrue_b32(), v686, v688); + svfloat32_t v691 = svsub_f32_x(svptrue_b32(), v686, v688); + svfloat32_t v692 = svadd_f32_x(svptrue_b32(), v687, v689); + svfloat32_t v693 = svsub_f32_x(svptrue_b32(), v687, v689); + svfloat32_t zero716 = svdup_n_f32(0); + svfloat32_t v716 = svcmla_f32_x(pred_full, zero716, v1595, v706, 90); + svfloat32_t zero723 = svdup_n_f32(0); + svfloat32_t v723 = 
svcmla_f32_x(pred_full, zero723, v1595, v707, 90); + svfloat32_t v760 = svmla_f32_x(pred_full, v730, v705, v1596); + svfloat32_t v761 = svmls_f32_x(pred_full, v730, v705, v1596); + svfloat32_t v762 = svadd_f32_x(svptrue_b32(), v742, v759); + svfloat32_t v763 = svsub_f32_x(svptrue_b32(), v742, v759); + svfloat32_t v764 = svmla_f32_x(pred_full, v752, v697, v1596); + svfloat32_t v765 = svnmls_f32_x(pred_full, v752, v697, v1596); + svfloat32_t v842 = svadd_f32_x(svptrue_b32(), v838, v840); + svfloat32_t v843 = svsub_f32_x(svptrue_b32(), v838, v840); + svfloat32_t v844 = svadd_f32_x(svptrue_b32(), v839, v841); + svfloat32_t v845 = svsub_f32_x(svptrue_b32(), v839, v841); + svfloat32_t v464 = svadd_f32_x(svptrue_b32(), v411, v439); + svfloat32_t v465 = svsub_f32_x(svptrue_b32(), v411, v439); + svfloat32_t v466 = svmla_f32_x(pred_full, v403, v417, v1566); + svfloat32_t v467 = svmls_f32_x(pred_full, v403, v417, v1566); + svfloat32_t v468 = svadd_f32_x(svptrue_b32(), v451, v458); + svfloat32_t v469 = svsub_f32_x(svptrue_b32(), v451, v458); + svfloat32_t v542 = svadd_f32_x(svptrue_b32(), v538, v540); + svfloat32_t v543 = svsub_f32_x(svptrue_b32(), v538, v540); + svfloat32_t v544 = svadd_f32_x(svptrue_b32(), v539, v541); + svfloat32_t v545 = svsub_f32_x(svptrue_b32(), v539, v541); + svfloat32_t v614 = svadd_f32_x(svptrue_b32(), v610, v612); + svfloat32_t v615 = svsub_f32_x(svptrue_b32(), v610, v612); + svfloat32_t v616 = svadd_f32_x(svptrue_b32(), v611, v613); + svfloat32_t v617 = svsub_f32_x(svptrue_b32(), v611, v613); + svfloat32_t v766 = svadd_f32_x(svptrue_b32(), v762, v764); + svfloat32_t v767 = svsub_f32_x(svptrue_b32(), v762, v764); + svfloat32_t v768 = svadd_f32_x(svptrue_b32(), v763, v765); + svfloat32_t v769 = svsub_f32_x(svptrue_b32(), v763, v765); + svfloat32_t v846 = svmla_f32_x(pred_full, v414, v486, v1571); + svfloat32_t v849 = svsub_f32_x(svptrue_b32(), v640, v716); + svfloat32_t v850 = svcmla_f32_x(pred_full, v716, v1603, v782, 90); + svfloat32_t v937 = svsub_f32_x(svptrue_b32(), v685, v761); + svfloat32_t v938 = svadd_f32_x(svptrue_b32(), v761, v837); + svfloat32_t v1022 = svmla_f32_x(pred_full, v415, v487, v1571); + svfloat32_t v1025 = svsub_f32_x(svptrue_b32(), v647, v723); + svfloat32_t v1026 = svcmla_f32_x(pred_full, v723, v1603, v783, 90); + svfloat32_t v1113 = svsub_f32_x(svptrue_b32(), v684, v760); + svfloat32_t v1114 = svadd_f32_x(svptrue_b32(), v760, v836); + svst1_f64(pred_full, (double *)(v1614), svreinterpret_f64_f32(v414)); + svst1_f64(pred_full, (double *)(v1794), svreinterpret_f64_f32(v415)); + svfloat32_t v470 = svadd_f32_x(svptrue_b32(), v466, v468); + svfloat32_t v471 = svsub_f32_x(svptrue_b32(), v466, v468); + svfloat32_t v472 = svadd_f32_x(svptrue_b32(), v467, v469); + svfloat32_t v473 = svsub_f32_x(svptrue_b32(), v467, v469); + svfloat32_t v847 = svmla_f32_x(pred_full, v846, v558, v1579); + svfloat32_t v848 = svmls_f32_x(pred_full, v846, v558, v1579); + svfloat32_t v893 = svsub_f32_x(svptrue_b32(), v691, v767); + svfloat32_t v894 = svadd_f32_x(svptrue_b32(), v767, v843); + svfloat32_t v934 = svadd_f32_x(svptrue_b32(), v465, v537); + svfloat32_t v981 = svsub_f32_x(svptrue_b32(), v692, v768); + svfloat32_t v982 = svadd_f32_x(svptrue_b32(), v768, v844); + svfloat32_t v1023 = svmla_f32_x(pred_full, v1022, v559, v1579); + svfloat32_t v1024 = svmls_f32_x(pred_full, v1022, v559, v1579); + svfloat32_t v1069 = svsub_f32_x(svptrue_b32(), v693, v769); + svfloat32_t v1070 = svadd_f32_x(svptrue_b32(), v769, v845); + svfloat32_t v1110 = svadd_f32_x(svptrue_b32(), v464, v536); 
+ svfloat32_t v1157 = svsub_f32_x(svptrue_b32(), v690, v766); + svfloat32_t v1158 = svadd_f32_x(svptrue_b32(), v766, v842); + svst1_f64(pred_full, (double *)(v1704), svreinterpret_f64_f32(v465)); + svst1_f64(pred_full, (double *)(v1884), svreinterpret_f64_f32(v464)); + svfloat32_t v851 = svadd_f32_x(svptrue_b32(), v847, v849); + svfloat32_t v852 = svsub_f32_x(svptrue_b32(), v847, v849); + svfloat32_t v853 = svadd_f32_x(svptrue_b32(), v848, v850); + svfloat32_t v854 = svsub_f32_x(svptrue_b32(), v848, v850); + svfloat32_t v890 = svadd_f32_x(svptrue_b32(), v471, v543); + svfloat32_t v935 = svadd_f32_x(svptrue_b32(), v934, v609); + svfloat32_t v936 = svsub_f32_x(svptrue_b32(), v934, v609); + svfloat32_t v978 = svadd_f32_x(svptrue_b32(), v472, v544); + svfloat32_t v1027 = svadd_f32_x(svptrue_b32(), v1023, v1025); + svfloat32_t v1028 = svsub_f32_x(svptrue_b32(), v1023, v1025); + svfloat32_t v1029 = svadd_f32_x(svptrue_b32(), v1024, v1026); + svfloat32_t v1030 = svsub_f32_x(svptrue_b32(), v1024, v1026); + svfloat32_t v1066 = svadd_f32_x(svptrue_b32(), v473, v545); + svfloat32_t v1111 = svadd_f32_x(svptrue_b32(), v1110, v608); + svfloat32_t v1112 = svsub_f32_x(svptrue_b32(), v1110, v608); + svfloat32_t v1154 = svadd_f32_x(svptrue_b32(), v470, v542); + svst1_f64(pred_full, (double *)(v1659), svreinterpret_f64_f32(v471)); + svst1_f64(pred_full, (double *)(v1749), svreinterpret_f64_f32(v472)); + svst1_f64(pred_full, (double *)(v1839), svreinterpret_f64_f32(v473)); + svst1_f64(pred_full, (double *)(v1929), svreinterpret_f64_f32(v470)); + svfloat32_t v891 = svadd_f32_x(svptrue_b32(), v890, v615); + svfloat32_t v892 = svsub_f32_x(svptrue_b32(), v890, v615); + svfloat32_t v939 = svadd_f32_x(svptrue_b32(), v935, v937); + svfloat32_t v940 = svsub_f32_x(svptrue_b32(), v935, v937); + svfloat32_t v941 = svadd_f32_x(svptrue_b32(), v936, v938); + svfloat32_t v942 = svsub_f32_x(svptrue_b32(), v936, v938); + svfloat32_t v979 = svadd_f32_x(svptrue_b32(), v978, v616); + svfloat32_t v980 = svsub_f32_x(svptrue_b32(), v978, v616); + svfloat32_t v1067 = svadd_f32_x(svptrue_b32(), v1066, v617); + svfloat32_t v1068 = svsub_f32_x(svptrue_b32(), v1066, v617); + svfloat32_t v1115 = svadd_f32_x(svptrue_b32(), v1111, v1113); + svfloat32_t v1116 = svsub_f32_x(svptrue_b32(), v1111, v1113); + svfloat32_t v1117 = svadd_f32_x(svptrue_b32(), v1112, v1114); + svfloat32_t v1118 = svsub_f32_x(svptrue_b32(), v1112, v1114); + svfloat32_t v1155 = svadd_f32_x(svptrue_b32(), v1154, v614); + svfloat32_t v1156 = svsub_f32_x(svptrue_b32(), v1154, v614); + svst1_f64(pred_full, (double *)(v1623), svreinterpret_f64_f32(v852)); + svst1_f64(pred_full, (double *)(v1632), svreinterpret_f64_f32(v854)); + svst1_f64(pred_full, (double *)(v1641), svreinterpret_f64_f32(v853)); + svst1_f64(pred_full, (double *)(v1650), svreinterpret_f64_f32(v851)); + svst1_f64(pred_full, (double *)(v1803), svreinterpret_f64_f32(v1028)); + svst1_f64(pred_full, (double *)(v1812), svreinterpret_f64_f32(v1030)); + svst1_f64(pred_full, (double *)(v1821), svreinterpret_f64_f32(v1029)); + svst1_f64(pred_full, (double *)(v1830), svreinterpret_f64_f32(v1027)); + svfloat32_t v895 = svadd_f32_x(svptrue_b32(), v891, v893); + svfloat32_t v896 = svsub_f32_x(svptrue_b32(), v891, v893); + svfloat32_t v897 = svadd_f32_x(svptrue_b32(), v892, v894); + svfloat32_t v898 = svsub_f32_x(svptrue_b32(), v892, v894); + svfloat32_t v983 = svadd_f32_x(svptrue_b32(), v979, v981); + svfloat32_t v984 = svsub_f32_x(svptrue_b32(), v979, v981); + svfloat32_t v985 = svadd_f32_x(svptrue_b32(), v980, v982); + 
svfloat32_t v986 = svsub_f32_x(svptrue_b32(), v980, v982); + svfloat32_t v1071 = svadd_f32_x(svptrue_b32(), v1067, v1069); + svfloat32_t v1072 = svsub_f32_x(svptrue_b32(), v1067, v1069); + svfloat32_t v1073 = svadd_f32_x(svptrue_b32(), v1068, v1070); + svfloat32_t v1074 = svsub_f32_x(svptrue_b32(), v1068, v1070); + svfloat32_t v1159 = svadd_f32_x(svptrue_b32(), v1155, v1157); + svfloat32_t v1160 = svsub_f32_x(svptrue_b32(), v1155, v1157); + svfloat32_t v1161 = svadd_f32_x(svptrue_b32(), v1156, v1158); + svfloat32_t v1162 = svsub_f32_x(svptrue_b32(), v1156, v1158); + svst1_f64(pred_full, (double *)(v1713), svreinterpret_f64_f32(v940)); + svst1_f64(pred_full, (double *)(v1722), svreinterpret_f64_f32(v942)); + svst1_f64(pred_full, (double *)(v1731), svreinterpret_f64_f32(v941)); + svst1_f64(pred_full, (double *)(v1740), svreinterpret_f64_f32(v939)); + svst1_f64(pred_full, (double *)(v1893), svreinterpret_f64_f32(v1116)); + svst1_f64(pred_full, (double *)(v1902), svreinterpret_f64_f32(v1118)); + svst1_f64(pred_full, (double *)(v1911), svreinterpret_f64_f32(v1117)); + svst1_f64(pred_full, (double *)(v1920), svreinterpret_f64_f32(v1115)); + svst1_f64(pred_full, (double *)(v1668), svreinterpret_f64_f32(v896)); + svst1_f64(pred_full, (double *)(v1677), svreinterpret_f64_f32(v898)); + svst1_f64(pred_full, (double *)(v1686), svreinterpret_f64_f32(v897)); + svst1_f64(pred_full, (double *)(v1695), svreinterpret_f64_f32(v895)); + svst1_f64(pred_full, (double *)(v1758), svreinterpret_f64_f32(v984)); + svst1_f64(pred_full, (double *)(v1767), svreinterpret_f64_f32(v986)); + svst1_f64(pred_full, (double *)(v1776), svreinterpret_f64_f32(v985)); + svst1_f64(pred_full, (double *)(v1785), svreinterpret_f64_f32(v983)); + svst1_f64(pred_full, (double *)(v1848), svreinterpret_f64_f32(v1072)); + svst1_f64(pred_full, (double *)(v1857), svreinterpret_f64_f32(v1074)); + svst1_f64(pred_full, (double *)(v1866), svreinterpret_f64_f32(v1073)); + svst1_f64(pred_full, (double *)(v1875), svreinterpret_f64_f32(v1071)); + svst1_f64(pred_full, (double *)(v1938), svreinterpret_f64_f32(v1160)); + svst1_f64(pred_full, (double *)(v1947), svreinterpret_f64_f32(v1162)); + svst1_f64(pred_full, (double *)(v1956), svreinterpret_f64_f32(v1161)); + svst1_f64(pred_full, (double *)(v1965), svreinterpret_f64_f32(v1159)); + v5 += v11; + v6 += v12; + } +} +#endif diff --git a/src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_gu.h b/src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_gu.h new file mode 100644 index 0000000000000000000000000000000000000000..5a95b0fc3b79a2eba3cca4acce567f5d8964ee9b --- /dev/null +++ b/src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_gu.h @@ -0,0 +1,44 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause +*/ +#pragma once + +#include "armral.h" +#include "fft_helper.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef void(cs16_cf32_cf32_ac_n_gu_fft_t)(const armral_cmplx_int16_t *x, + armral_cmplx_f32_t *y, int istride, + int ostride, int howmany, int idist, + float dir); + +cs16_cf32_cf32_ac_n_gu_fft_t armral_fft_cs16_cf32_cf32_ac_n_gu7; +cs16_cf32_cf32_ac_n_gu_fft_t armral_fft_cs16_cf32_cf32_ac_n_gu9; +cs16_cf32_cf32_ac_n_gu_fft_t armral_fft_cs16_cf32_cf32_ac_n_gu11; +cs16_cf32_cf32_ac_n_gu_fft_t armral_fft_cs16_cf32_cf32_ac_n_gu13; +cs16_cf32_cf32_ac_n_gu_fft_t armral_fft_cs16_cf32_cf32_ac_n_gu14; +cs16_cf32_cf32_ac_n_gu_fft_t armral_fft_cs16_cf32_cf32_ac_n_gu15; +cs16_cf32_cf32_ac_n_gu_fft_t 
armral_fft_cs16_cf32_cf32_ac_n_gu16;
+cs16_cf32_cf32_ac_n_gu_fft_t armral_fft_cs16_cf32_cf32_ac_n_gu17;
+cs16_cf32_cf32_ac_n_gu_fft_t armral_fft_cs16_cf32_cf32_ac_n_gu18;
+cs16_cf32_cf32_ac_n_gu_fft_t armral_fft_cs16_cf32_cf32_ac_n_gu19;
+cs16_cf32_cf32_ac_n_gu_fft_t armral_fft_cs16_cf32_cf32_ac_n_gu20;
+cs16_cf32_cf32_ac_n_gu_fft_t armral_fft_cs16_cf32_cf32_ac_n_gu21;
+cs16_cf32_cf32_ac_n_gu_fft_t armral_fft_cs16_cf32_cf32_ac_n_gu22;
+cs16_cf32_cf32_ac_n_gu_fft_t armral_fft_cs16_cf32_cf32_ac_n_gu24;
+cs16_cf32_cf32_ac_n_gu_fft_t armral_fft_cs16_cf32_cf32_ac_n_gu25;
+cs16_cf32_cf32_ac_n_gu_fft_t armral_fft_cs16_cf32_cf32_ac_n_gu28;
+cs16_cf32_cf32_ac_n_gu_fft_t armral_fft_cs16_cf32_cf32_ac_n_gu30;
+cs16_cf32_cf32_ac_n_gu_fft_t armral_fft_cs16_cf32_cf32_ac_n_gu32;
+cs16_cf32_cf32_ac_n_gu_fft_t armral_fft_cs16_cf32_cf32_ac_n_gu36;
+cs16_cf32_cf32_ac_n_gu_fft_t armral_fft_cs16_cf32_cf32_ac_n_gu40;
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
\ No newline at end of file
diff --git a/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uu.c b/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uu.c
new file mode 100644
index 0000000000000000000000000000000000000000..d1d22ad32c0d0364c32d43d4f4ed95504d4250bd
--- /dev/null
+++ b/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uu.c
@@ -0,0 +1,14575 @@
+/*
+  Arm RAN Acceleration Library
+  SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its
+  affiliates
+  SPDX-License-Identifier: BSD-3-Clause
+*/
+#include "fft_cs16_cf32_cs16_ac_n_uu.h"
+
+#include <arm_neon.h>
+#ifdef ARMRAL_ARCH_SVE
+#include <arm_sve.h>
+#endif
+
+#ifndef ARMRAL_ARCH_SVE
+void armral_fft_cs16_cf32_cs16_ac_n_uu2(const armral_cmplx_int16_t *restrict x,
+                                        armral_cmplx_int16_t *restrict y,
+                                        int istride, int ostride, int howmany,
+                                        float dir) {
+  const int32_t *v5 = (const int32_t *)x;
+  int32_t *v6 = (int32_t *)y;
+  for (int j = 0; j < howmany; j += 1) {
+    int16x4_t v26 = vld1s_s16(&v5[istride]);
+    int16x4_t v20 = vld1s_s16(&v5[0]);
+    float32x2_t v27 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v26)), 15);
+    float32x2_t v21 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v20)), 15);
+    float32x2_t v28 = vadd_f32(v21, v27);
+    float32x2_t v29 = vsub_f32(v21, v27);
+    int16x4_t v40 =
+        vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v28, 15), (int32x2_t){0, 0}));
+    int16x4_t v46 =
+        vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v29, 15), (int32x2_t){0, 0}));
+    v6[0] = vget_lane_s32(vreinterpret_s32_s16(v40), 0);
+    v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v46), 0);
+    v5 += 1 * 1;
+    v6 += 1 * 1;
+  }
+}
+#endif
+
+#ifdef ARMRAL_ARCH_SVE
+void armral_fft_cs16_cf32_cs16_ac_n_uu2(const armral_cmplx_int16_t *restrict x,
+                                        armral_cmplx_int16_t *restrict y,
+                                        int istride, int ostride, int howmany,
+                                        float dir) {
+  int64_t v0 = istride;
+  int64_t v2 = ostride;
+  const int32_t *v5 = (const int32_t *)x;
+  int32_t *v6 = (int32_t *)y;
+  int64_t v8 = howmany;
+  int64_t v10 = svcntd();
+  int64_t v11 = v10 * 1;
+  int64_t v12 = v10 * 1;
+  for (int j = 0; j < v8; j += v10) {
+    svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2);
+    const int32_t *v78 = &v5[v0];
+    int32_t *v99 = &v6[v2];
+    const int32_t *v69 = &v5[0];
+    int32_t *v90 = &v6[0];
+    svfloat32_t v33 = svmul_n_f32_x(
+        pred_full,
+        svcvt_f32_s32_x(pred_full,
+                        svld1sh_s32(pred_full, (const int16_t *)&v78[0])),
+        1.F / (1ULL << 15ULL));
+    svfloat32_t v25 = svmul_n_f32_x(
+        pred_full,
+        svcvt_f32_s32_x(pred_full,
+                        svld1sh_s32(pred_full, (const int16_t *)&v69[0])),
+        1.F / (1ULL << 15ULL));
+    svfloat32_t v34 = svadd_f32_x(svptrue_b32(), v25, v33);
+    svfloat32_t v35 = svsub_f32_x(svptrue_b32(), v25, v33);
+
svint16_t v48 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v34, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v56 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v35, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v90), svreinterpret_u64_s16(v48)); + svst1w_u64(pred_full, (unsigned *)(v99), svreinterpret_u64_s16(v56)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uu3(const armral_cmplx_int16_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + for (int j = 0; j < howmany; j += 1) { + int16x4_t v20 = vld1s_s16(&v5[istride]); + float v42 = -1.4999999999999998e+00F; + float v45 = 8.6602540378443871e-01F; + float v46 = -8.6602540378443871e-01F; + float32x2_t v48 = (float32x2_t){v4, v4}; + float32x2_t v21 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v20)), 15); + int16x4_t v34 = vld1s_s16(&v5[0]); + float32x2_t v43 = (float32x2_t){v42, v42}; + float32x2_t v47 = (float32x2_t){v45, v46}; + int16x4_t v26 = vld1s_s16(&v5[istride * 2]); + float32x2_t v35 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v34)), 15); + float32x2_t v49 = vmul_f32(v48, v47); + float32x2_t v27 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v26)), 15); + float32x2_t v28 = vadd_f32(v21, v27); + float32x2_t v29 = vsub_f32(v21, v27); + float32x2_t v36 = vadd_f32(v28, v35); + float32x2_t v44 = vmul_f32(v28, v43); + float32x2_t v50 = vrev64_f32(v29); + float32x2_t v51 = vmul_f32(v50, v49); + float32x2_t v52 = vadd_f32(v36, v44); + int16x4_t v57 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v36, 15), (int32x2_t){0, 0})); + float32x2_t v53 = vadd_f32(v52, v51); + float32x2_t v54 = vsub_f32(v52, v51); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v57), 0); + int16x4_t v63 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v54, 15), (int32x2_t){0, 0})); + int16x4_t v69 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v53, 15), (int32x2_t){0, 0})); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v63), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v69), 0); + v5 += 1 * 1; + v6 += 1 * 1; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uu3(const armral_cmplx_int16_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * 1; + int64_t v12 = v10 * 1; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + float v51 = -1.4999999999999998e+00F; + float v56 = -8.6602540378443871e-01F; + const int32_t *v95 = &v5[v0]; + int32_t *v136 = &v6[v2]; + int64_t v27 = v0 * 2; + float v59 = v4 * v56; + int64_t v82 = v2 * 2; + const int32_t *v114 = &v5[0]; + svfloat32_t v118 = svdup_n_f32(v51); + int32_t *v127 = &v6[0]; + svfloat32_t v25 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v95[0])), + 1.F / (1ULL << 15ULL)); + const int32_t *v104 = &v5[v27]; + svfloat32_t v119 = svdup_n_f32(v59); + int32_t *v145 = &v6[v82]; + 
svfloat32_t v43 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v114[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v33 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v104[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v34 = svadd_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v35 = svsub_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v44 = svadd_f32_x(svptrue_b32(), v34, v43); + svfloat32_t zero61 = svdup_n_f32(0); + svfloat32_t v61 = svcmla_f32_x(pred_full, zero61, v119, v35, 90); + svfloat32_t v62 = svmla_f32_x(pred_full, v44, v34, v118); + svint16_t v67 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v44, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v63 = svadd_f32_x(svptrue_b32(), v62, v61); + svfloat32_t v64 = svsub_f32_x(svptrue_b32(), v62, v61); + svst1w_u64(pred_full, (unsigned *)(v127), svreinterpret_u64_s16(v67)); + svint16_t v75 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v64, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v83 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v63, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v136), svreinterpret_u64_s16(v75)); + svst1w_u64(pred_full, (unsigned *)(v145), svreinterpret_u64_s16(v83)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uu4(const armral_cmplx_int16_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + for (int j = 0; j < howmany; j += 1) { + int16x4_t v34 = vld1s_s16(&v5[istride]); + float v58 = 1.0000000000000000e+00F; + float v59 = -1.0000000000000000e+00F; + float32x2_t v61 = (float32x2_t){v4, v4}; + int16x4_t v20 = vld1s_s16(&v5[0]); + float32x2_t v35 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v34)), 15); + float32x2_t v60 = (float32x2_t){v58, v59}; + float32x2_t v21 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v20)), 15); + int16x4_t v26 = vld1s_s16(&v5[istride * 2]); + int16x4_t v40 = vld1s_s16(&v5[istride * 3]); + float32x2_t v62 = vmul_f32(v61, v60); + float32x2_t v27 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v26)), 15); + float32x2_t v41 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v40)), 15); + float32x2_t v28 = vadd_f32(v21, v27); + float32x2_t v29 = vsub_f32(v21, v27); + float32x2_t v42 = vadd_f32(v35, v41); + float32x2_t v43 = vsub_f32(v35, v41); + float32x2_t v44 = vadd_f32(v28, v42); + float32x2_t v45 = vsub_f32(v28, v42); + float32x2_t v63 = vrev64_f32(v43); + float32x2_t v64 = vmul_f32(v63, v62); + int16x4_t v69 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v44, 15), (int32x2_t){0, 0})); + int16x4_t v81 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v45, 15), (int32x2_t){0, 0})); + float32x2_t v65 = vadd_f32(v29, v64); + float32x2_t v66 = vsub_f32(v29, v64); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v69), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v81), 0); + int16x4_t v75 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v66, 15), (int32x2_t){0, 0})); + int16x4_t v87 = + 
vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v65, 15), (int32x2_t){0, 0})); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v75), 0); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v87), 0); + v5 += 1 * 1; + v6 += 1 * 1; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uu4(const armral_cmplx_int16_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * 1; + int64_t v12 = v10 * 1; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + float v72 = -1.0000000000000000e+00F; + const int32_t *v137 = &v5[v0]; + int32_t *v169 = &v6[v2]; + int64_t v27 = v0 * 2; + int64_t v45 = v0 * 3; + float v75 = v4 * v72; + int64_t v97 = v2 * 2; + int64_t v105 = v2 * 3; + const int32_t *v119 = &v5[0]; + int32_t *v160 = &v6[0]; + svfloat32_t v43 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v137[0])), + 1.F / (1ULL << 15ULL)); + const int32_t *v128 = &v5[v27]; + const int32_t *v146 = &v5[v45]; + svfloat32_t v152 = svdup_n_f32(v75); + int32_t *v178 = &v6[v97]; + int32_t *v187 = &v6[v105]; + svfloat32_t v25 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v119[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v33 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v128[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v51 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v146[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v34 = svadd_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v35 = svsub_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v52 = svadd_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v53 = svsub_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v54 = svadd_f32_x(svptrue_b32(), v34, v52); + svfloat32_t v55 = svsub_f32_x(svptrue_b32(), v34, v52); + svfloat32_t zero77 = svdup_n_f32(0); + svfloat32_t v77 = svcmla_f32_x(pred_full, zero77, v152, v53, 90); + svfloat32_t v78 = svadd_f32_x(svptrue_b32(), v35, v77); + svfloat32_t v79 = svsub_f32_x(svptrue_b32(), v35, v77); + svint16_t v82 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v54, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v98 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v55, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v90 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v79, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v106 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v78, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v160), svreinterpret_u64_s16(v82)); + svst1w_u64(pred_full, (unsigned *)(v178), svreinterpret_u64_s16(v98)); + svst1w_u64(pred_full, (unsigned *)(v169), svreinterpret_u64_s16(v90)); + 
svst1w_u64(pred_full, (unsigned *)(v187), svreinterpret_u64_s16(v106)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uu5(const armral_cmplx_int16_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + for (int j = 0; j < howmany; j += 1) { + int16x4_t v20 = vld1s_s16(&v5[istride]); + float v59 = -1.2500000000000000e+00F; + float v63 = 5.5901699437494745e-01F; + float v66 = 1.5388417685876268e+00F; + float v67 = -1.5388417685876268e+00F; + float v73 = 5.8778525229247325e-01F; + float v74 = -5.8778525229247325e-01F; + float v80 = 3.6327126400268028e-01F; + float v81 = -3.6327126400268028e-01F; + float32x2_t v83 = (float32x2_t){v4, v4}; + float32x2_t v21 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v20)), 15); + int16x4_t v51 = vld1s_s16(&v5[0]); + float32x2_t v60 = (float32x2_t){v59, v59}; + float32x2_t v64 = (float32x2_t){v63, v63}; + float32x2_t v68 = (float32x2_t){v66, v67}; + float32x2_t v75 = (float32x2_t){v73, v74}; + float32x2_t v82 = (float32x2_t){v80, v81}; + int16x4_t v26 = vld1s_s16(&v5[istride * 4]); + int16x4_t v34 = vld1s_s16(&v5[istride * 3]); + int16x4_t v40 = vld1s_s16(&v5[istride * 2]); + float32x2_t v52 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v51)), 15); + float32x2_t v70 = vmul_f32(v83, v68); + float32x2_t v77 = vmul_f32(v83, v75); + float32x2_t v84 = vmul_f32(v83, v82); + float32x2_t v27 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v26)), 15); + float32x2_t v35 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v34)), 15); + float32x2_t v41 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v40)), 15); + float32x2_t v28 = vadd_f32(v21, v27); + float32x2_t v29 = vsub_f32(v21, v27); + float32x2_t v42 = vadd_f32(v35, v41); + float32x2_t v43 = vsub_f32(v35, v41); + float32x2_t v44 = vadd_f32(v28, v42); + float32x2_t v45 = vsub_f32(v28, v42); + float32x2_t v46 = vadd_f32(v29, v43); + float32x2_t v71 = vrev64_f32(v29); + float32x2_t v85 = vrev64_f32(v43); + float32x2_t v53 = vadd_f32(v44, v52); + float32x2_t v61 = vmul_f32(v44, v60); + float32x2_t v65 = vmul_f32(v45, v64); + float32x2_t v72 = vmul_f32(v71, v70); + float32x2_t v78 = vrev64_f32(v46); + float32x2_t v86 = vmul_f32(v85, v84); + float32x2_t v79 = vmul_f32(v78, v77); + float32x2_t v87 = vadd_f32(v53, v61); + int16x4_t v98 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v53, 15), (int32x2_t){0, 0})); + float32x2_t v88 = vadd_f32(v87, v65); + float32x2_t v89 = vsub_f32(v87, v65); + float32x2_t v90 = vsub_f32(v72, v79); + float32x2_t v91 = vadd_f32(v79, v86); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v98), 0); + float32x2_t v92 = vadd_f32(v88, v90); + float32x2_t v93 = vsub_f32(v88, v90); + float32x2_t v94 = vadd_f32(v89, v91); + float32x2_t v95 = vsub_f32(v89, v91); + int16x4_t v104 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v93, 15), (int32x2_t){0, 0})); + int16x4_t v110 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v95, 15), (int32x2_t){0, 0})); + int16x4_t v116 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v94, 15), (int32x2_t){0, 0})); + int16x4_t v122 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v92, 15), (int32x2_t){0, 0})); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v104), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v110), 0); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v116), 0); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v122), 0); + v5 += 1 * 1; + v6 += 1 * 1; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE 
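+// SVE variant of the length-5 kernel: each loop iteration transforms
+// svcntd() batches at once, and svwhilelt_b32 builds a predicate over
+// 2 * howmany 32-bit lanes (one real and one imaginary component per
+// batch) so the final partial batch needs no scalar tail loop.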
+void armral_fft_cs16_cf32_cs16_ac_n_uu5(const armral_cmplx_int16_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * 1; + int64_t v12 = v10 * 1; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + float v72 = -1.2500000000000000e+00F; + float v77 = 5.5901699437494745e-01F; + float v82 = -1.5388417685876268e+00F; + float v89 = -5.8778525229247325e-01F; + float v96 = -3.6327126400268028e-01F; + const int32_t *v157 = &v5[v0]; + int32_t *v219 = &v6[v2]; + int64_t v27 = v0 * 4; + int64_t v37 = v0 * 3; + int64_t v45 = v0 * 2; + float v85 = v4 * v82; + float v92 = v4 * v89; + float v99 = v4 * v96; + int64_t v128 = v2 * 2; + int64_t v136 = v2 * 3; + int64_t v144 = v2 * 4; + const int32_t *v194 = &v5[0]; + svfloat32_t v198 = svdup_n_f32(v72); + svfloat32_t v199 = svdup_n_f32(v77); + int32_t *v210 = &v6[0]; + svfloat32_t v25 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v157[0])), + 1.F / (1ULL << 15ULL)); + const int32_t *v166 = &v5[v27]; + const int32_t *v175 = &v5[v37]; + const int32_t *v184 = &v5[v45]; + svfloat32_t v200 = svdup_n_f32(v85); + svfloat32_t v201 = svdup_n_f32(v92); + svfloat32_t v202 = svdup_n_f32(v99); + int32_t *v228 = &v6[v128]; + int32_t *v237 = &v6[v136]; + int32_t *v246 = &v6[v144]; + svfloat32_t v64 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v194[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v33 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v166[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v43 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v175[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v51 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v184[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v34 = svadd_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v35 = svsub_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v52 = svadd_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v53 = svsub_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v54 = svadd_f32_x(svptrue_b32(), v34, v52); + svfloat32_t v55 = svsub_f32_x(svptrue_b32(), v34, v52); + svfloat32_t v56 = svadd_f32_x(svptrue_b32(), v35, v53); + svfloat32_t zero87 = svdup_n_f32(0); + svfloat32_t v87 = svcmla_f32_x(pred_full, zero87, v200, v35, 90); + svfloat32_t v65 = svadd_f32_x(svptrue_b32(), v54, v64); + svfloat32_t zero94 = svdup_n_f32(0); + svfloat32_t v94 = svcmla_f32_x(pred_full, zero94, v201, v56, 90); + svfloat32_t v102 = svmla_f32_x(pred_full, v65, v54, v198); + svfloat32_t v105 = svsub_f32_x(svptrue_b32(), v87, v94); + svfloat32_t v106 = svcmla_f32_x(pred_full, v94, v202, v53, 90); + svint16_t v113 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v65, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v103 = svmla_f32_x(pred_full, v102, v55, v199); + svfloat32_t v104 = svmls_f32_x(pred_full, v102, v55, v199); + svst1w_u64(pred_full, (unsigned *)(v210), svreinterpret_u64_s16(v113)); + svfloat32_t v107 = svadd_f32_x(svptrue_b32(), 
v103, v105); + svfloat32_t v108 = svsub_f32_x(svptrue_b32(), v103, v105); + svfloat32_t v109 = svadd_f32_x(svptrue_b32(), v104, v106); + svfloat32_t v110 = svsub_f32_x(svptrue_b32(), v104, v106); + svint16_t v121 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v108, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v129 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v110, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v137 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v109, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v145 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v107, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v219), svreinterpret_u64_s16(v121)); + svst1w_u64(pred_full, (unsigned *)(v228), svreinterpret_u64_s16(v129)); + svst1w_u64(pred_full, (unsigned *)(v237), svreinterpret_u64_s16(v137)); + svst1w_u64(pred_full, (unsigned *)(v246), svreinterpret_u64_s16(v145)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uu6(const armral_cmplx_int16_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + for (int j = 0; j < howmany; j += 1) { + int16x4_t v54 = vld1s_s16(&v5[istride]); + float v87 = -1.4999999999999998e+00F; + float v90 = 8.6602540378443871e-01F; + float v91 = -8.6602540378443871e-01F; + float32x2_t v93 = (float32x2_t){v4, v4}; + int16x4_t v20 = vld1s_s16(&v5[0]); + float32x2_t v55 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v54)), 15); + float32x2_t v88 = (float32x2_t){v87, v87}; + float32x2_t v92 = (float32x2_t){v90, v91}; + float32x2_t v21 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v20)), 15); + int16x4_t v26 = vld1s_s16(&v5[istride * 3]); + int16x4_t v34 = vld1s_s16(&v5[istride * 2]); + int16x4_t v40 = vld1s_s16(&v5[istride * 5]); + int16x4_t v48 = vld1s_s16(&v5[istride * 4]); + float32x2_t v94 = vmul_f32(v93, v92); + float32x2_t v27 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v26)), 15); + float32x2_t v35 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v34)), 15); + float32x2_t v41 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v40)), 15); + float32x2_t v49 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v48)), 15); + float32x2_t v28 = vadd_f32(v21, v27); + float32x2_t v29 = vsub_f32(v21, v27); + float32x2_t v42 = vadd_f32(v35, v41); + float32x2_t v43 = vsub_f32(v35, v41); + float32x2_t v56 = vadd_f32(v49, v55); + float32x2_t v57 = vsub_f32(v49, v55); + float32x2_t v58 = vadd_f32(v42, v56); + float32x2_t v59 = vsub_f32(v42, v56); + float32x2_t v79 = vadd_f32(v43, v57); + float32x2_t v80 = vsub_f32(v43, v57); + float32x2_t v60 = vadd_f32(v58, v28); + float32x2_t v68 = vmul_f32(v58, v88); + float32x2_t v74 = vrev64_f32(v59); + float32x2_t v81 = vadd_f32(v79, v29); + float32x2_t v89 = vmul_f32(v79, v88); + float32x2_t v95 = vrev64_f32(v80); + float32x2_t v75 = vmul_f32(v74, v94); + float32x2_t v76 = vadd_f32(v60, v68); + float32x2_t v96 = vmul_f32(v95, v94); + float32x2_t v97 = vadd_f32(v81, 
v89); + int16x4_t v102 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v60, 15), (int32x2_t){0, 0})); + int16x4_t v108 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v81, 15), (int32x2_t){0, 0})); + float32x2_t v77 = vadd_f32(v76, v75); + float32x2_t v78 = vsub_f32(v76, v75); + float32x2_t v98 = vadd_f32(v97, v96); + float32x2_t v99 = vsub_f32(v97, v96); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v102), 0); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v108), 0); + int16x4_t v114 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v78, 15), (int32x2_t){0, 0})); + int16x4_t v120 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v99, 15), (int32x2_t){0, 0})); + int16x4_t v126 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v77, 15), (int32x2_t){0, 0})); + int16x4_t v132 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v98, 15), (int32x2_t){0, 0})); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v114), 0); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v120), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v126), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v132), 0); + v5 += 1 * 1; + v6 += 1 * 1; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uu6(const armral_cmplx_int16_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * 1; + int64_t v12 = v10 * 1; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + float v104 = -1.4999999999999998e+00F; + float v109 = -8.6602540378443871e-01F; + const int32_t *v218 = &v5[v0]; + int32_t *v261 = &v6[v2]; + int64_t v27 = v0 * 3; + int64_t v37 = v0 * 2; + int64_t v45 = v0 * 5; + int64_t v55 = v0 * 4; + float v112 = v4 * v109; + int64_t v127 = v2 * 3; + int64_t v135 = v2 * 4; + int64_t v151 = v2 * 2; + int64_t v159 = v2 * 5; + const int32_t *v173 = &v5[0]; + svfloat32_t v225 = svdup_n_f32(v104); + int32_t *v234 = &v6[0]; + svfloat32_t v69 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v218[0])), + 1.F / (1ULL << 15ULL)); + const int32_t *v182 = &v5[v27]; + const int32_t *v191 = &v5[v37]; + const int32_t *v200 = &v5[v45]; + const int32_t *v209 = &v5[v55]; + svfloat32_t v226 = svdup_n_f32(v112); + int32_t *v243 = &v6[v127]; + int32_t *v252 = &v6[v135]; + int32_t *v270 = &v6[v151]; + int32_t *v279 = &v6[v159]; + svfloat32_t v25 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v173[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v33 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v182[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v43 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v191[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v51 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v200[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v61 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v209[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v34 = svadd_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v35 = svsub_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v52 = svadd_f32_x(svptrue_b32(), v43, v51); + 
svfloat32_t v53 = svsub_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v70 = svadd_f32_x(svptrue_b32(), v61, v69); + svfloat32_t v71 = svsub_f32_x(svptrue_b32(), v61, v69); + svfloat32_t v72 = svadd_f32_x(svptrue_b32(), v52, v70); + svfloat32_t v73 = svsub_f32_x(svptrue_b32(), v52, v70); + svfloat32_t v95 = svadd_f32_x(svptrue_b32(), v53, v71); + svfloat32_t v96 = svsub_f32_x(svptrue_b32(), v53, v71); + svfloat32_t v74 = svadd_f32_x(svptrue_b32(), v72, v34); + svfloat32_t zero91 = svdup_n_f32(0); + svfloat32_t v91 = svcmla_f32_x(pred_full, zero91, v226, v73, 90); + svfloat32_t v97 = svadd_f32_x(svptrue_b32(), v95, v35); + svfloat32_t zero114 = svdup_n_f32(0); + svfloat32_t v114 = svcmla_f32_x(pred_full, zero114, v226, v96, 90); + svfloat32_t v92 = svmla_f32_x(pred_full, v74, v72, v225); + svfloat32_t v115 = svmla_f32_x(pred_full, v97, v95, v225); + svint16_t v120 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v74, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v128 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v97, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v93 = svadd_f32_x(svptrue_b32(), v92, v91); + svfloat32_t v94 = svsub_f32_x(svptrue_b32(), v92, v91); + svfloat32_t v116 = svadd_f32_x(svptrue_b32(), v115, v114); + svfloat32_t v117 = svsub_f32_x(svptrue_b32(), v115, v114); + svst1w_u64(pred_full, (unsigned *)(v234), svreinterpret_u64_s16(v120)); + svst1w_u64(pred_full, (unsigned *)(v243), svreinterpret_u64_s16(v128)); + svint16_t v136 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v94, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v144 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v117, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v152 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v93, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v160 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v116, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v252), svreinterpret_u64_s16(v136)); + svst1w_u64(pred_full, (unsigned *)(v261), svreinterpret_u64_s16(v144)); + svst1w_u64(pred_full, (unsigned *)(v270), svreinterpret_u64_s16(v152)); + svst1w_u64(pred_full, (unsigned *)(v279), svreinterpret_u64_s16(v160)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uu7(const armral_cmplx_int16_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + for (int j = 0; j < howmany; j += 1) { + int16x4_t v20 = vld1s_s16(&v5[istride]); + float v80 = -1.1666666666666665e+00F; + float v84 = 7.9015646852540022e-01F; + float v88 = 5.5854267289647742e-02F; + float v92 = 7.3430220123575241e-01F; + float v95 = 4.4095855184409838e-01F; + float v96 = 
-4.4095855184409838e-01F; + float v102 = 3.4087293062393137e-01F; + float v103 = -3.4087293062393137e-01F; + float v109 = -5.3396936033772524e-01F; + float v110 = 5.3396936033772524e-01F; + float v116 = 8.7484229096165667e-01F; + float v117 = -8.7484229096165667e-01F; + float32x2_t v119 = (float32x2_t){v4, v4}; + float32x2_t v21 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v20)), 15); + int16x4_t v64 = vld1s_s16(&v5[0]); + float32x2_t v81 = (float32x2_t){v80, v80}; + float32x2_t v85 = (float32x2_t){v84, v84}; + float32x2_t v89 = (float32x2_t){v88, v88}; + float32x2_t v93 = (float32x2_t){v92, v92}; + float32x2_t v97 = (float32x2_t){v95, v96}; + float32x2_t v104 = (float32x2_t){v102, v103}; + float32x2_t v111 = (float32x2_t){v109, v110}; + float32x2_t v118 = (float32x2_t){v116, v117}; + int16x4_t v26 = vld1s_s16(&v5[istride * 6]); + int16x4_t v34 = vld1s_s16(&v5[istride * 4]); + int16x4_t v40 = vld1s_s16(&v5[istride * 3]); + int16x4_t v48 = vld1s_s16(&v5[istride * 2]); + int16x4_t v54 = vld1s_s16(&v5[istride * 5]); + float32x2_t v65 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v64)), 15); + float32x2_t v99 = vmul_f32(v119, v97); + float32x2_t v106 = vmul_f32(v119, v104); + float32x2_t v113 = vmul_f32(v119, v111); + float32x2_t v120 = vmul_f32(v119, v118); + float32x2_t v27 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v26)), 15); + float32x2_t v35 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v34)), 15); + float32x2_t v41 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v40)), 15); + float32x2_t v49 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v48)), 15); + float32x2_t v55 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v54)), 15); + float32x2_t v28 = vadd_f32(v21, v27); + float32x2_t v29 = vsub_f32(v21, v27); + float32x2_t v42 = vadd_f32(v35, v41); + float32x2_t v43 = vsub_f32(v35, v41); + float32x2_t v56 = vadd_f32(v49, v55); + float32x2_t v57 = vsub_f32(v49, v55); + float32x2_t v58 = vadd_f32(v28, v42); + float32x2_t v67 = vsub_f32(v28, v42); + float32x2_t v68 = vsub_f32(v42, v56); + float32x2_t v69 = vsub_f32(v56, v28); + float32x2_t v70 = vadd_f32(v29, v43); + float32x2_t v72 = vsub_f32(v29, v43); + float32x2_t v73 = vsub_f32(v43, v57); + float32x2_t v74 = vsub_f32(v57, v29); + float32x2_t v59 = vadd_f32(v58, v56); + float32x2_t v71 = vadd_f32(v70, v57); + float32x2_t v86 = vmul_f32(v67, v85); + float32x2_t v90 = vmul_f32(v68, v89); + float32x2_t v94 = vmul_f32(v69, v93); + float32x2_t v107 = vrev64_f32(v72); + float32x2_t v114 = vrev64_f32(v73); + float32x2_t v121 = vrev64_f32(v74); + float32x2_t v66 = vadd_f32(v59, v65); + float32x2_t v82 = vmul_f32(v59, v81); + float32x2_t v100 = vrev64_f32(v71); + float32x2_t v108 = vmul_f32(v107, v106); + float32x2_t v115 = vmul_f32(v114, v113); + float32x2_t v122 = vmul_f32(v121, v120); + float32x2_t v101 = vmul_f32(v100, v99); + float32x2_t v123 = vadd_f32(v66, v82); + int16x4_t v144 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v66, 15), (int32x2_t){0, 0})); + float32x2_t v124 = vadd_f32(v123, v86); + float32x2_t v126 = vsub_f32(v123, v86); + float32x2_t v128 = vsub_f32(v123, v90); + float32x2_t v130 = vadd_f32(v101, v108); + float32x2_t v132 = vsub_f32(v101, v108); + float32x2_t v134 = vsub_f32(v101, v115); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v144), 0); + float32x2_t v125 = vadd_f32(v124, v90); + float32x2_t v127 = vsub_f32(v126, v94); + float32x2_t v129 = vadd_f32(v128, v94); + float32x2_t v131 = vadd_f32(v130, v115); + float32x2_t v133 = vsub_f32(v132, v122); + float32x2_t v135 = vadd_f32(v134, v122); + float32x2_t v136 = vadd_f32(v125, v131); + float32x2_t v137 = vsub_f32(v125, 
v131); + float32x2_t v138 = vadd_f32(v127, v133); + float32x2_t v139 = vsub_f32(v127, v133); + float32x2_t v140 = vadd_f32(v129, v135); + float32x2_t v141 = vsub_f32(v129, v135); + int16x4_t v150 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v137, 15), (int32x2_t){0, 0})); + int16x4_t v156 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v139, 15), (int32x2_t){0, 0})); + int16x4_t v162 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v140, 15), (int32x2_t){0, 0})); + int16x4_t v168 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v141, 15), (int32x2_t){0, 0})); + int16x4_t v174 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v138, 15), (int32x2_t){0, 0})); + int16x4_t v180 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v136, 15), (int32x2_t){0, 0})); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v150), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v156), 0); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v162), 0); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v168), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v174), 0); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v180), 0); + v5 += 1 * 1; + v6 += 1 * 1; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uu7(const armral_cmplx_int16_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * 1; + int64_t v12 = v10 * 1; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + float v97 = -1.1666666666666665e+00F; + float v102 = 7.9015646852540022e-01F; + float v107 = 5.5854267289647742e-02F; + float v112 = 7.3430220123575241e-01F; + float v117 = -4.4095855184409838e-01F; + float v124 = -3.4087293062393137e-01F; + float v131 = 5.3396936033772524e-01F; + float v138 = -8.7484229096165667e-01F; + const int32_t *v225 = &v5[v0]; + int32_t *v308 = &v6[v2]; + int64_t v27 = v0 * 6; + int64_t v37 = v0 * 4; + int64_t v45 = v0 * 3; + int64_t v55 = v0 * 2; + int64_t v63 = v0 * 5; + float v120 = v4 * v117; + float v127 = v4 * v124; + float v134 = v4 * v131; + float v141 = v4 * v138; + int64_t v180 = v2 * 2; + int64_t v188 = v2 * 3; + int64_t v196 = v2 * 4; + int64_t v204 = v2 * 5; + int64_t v212 = v2 * 6; + const int32_t *v280 = &v5[0]; + svfloat32_t v284 = svdup_n_f32(v97); + svfloat32_t v285 = svdup_n_f32(v102); + svfloat32_t v286 = svdup_n_f32(v107); + svfloat32_t v287 = svdup_n_f32(v112); + int32_t *v299 = &v6[0]; + svfloat32_t v25 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v225[0])), + 1.F / (1ULL << 15ULL)); + const int32_t *v234 = &v5[v27]; + const int32_t *v243 = &v5[v37]; + const int32_t *v252 = &v5[v45]; + const int32_t *v261 = &v5[v55]; + const int32_t *v270 = &v5[v63]; + svfloat32_t v288 = svdup_n_f32(v120); + svfloat32_t v289 = svdup_n_f32(v127); + svfloat32_t v290 = svdup_n_f32(v134); + svfloat32_t v291 = svdup_n_f32(v141); + int32_t *v317 = &v6[v180]; + int32_t *v326 = &v6[v188]; + int32_t *v335 = &v6[v196]; + int32_t *v344 = &v6[v204]; + int32_t *v353 = &v6[v212]; + svfloat32_t v81 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v280[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v33 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const 
int16_t *)&v234[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v43 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v243[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v51 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v252[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v61 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v261[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v69 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v270[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v34 = svadd_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v35 = svsub_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v52 = svadd_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v53 = svsub_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v70 = svadd_f32_x(svptrue_b32(), v61, v69); + svfloat32_t v71 = svsub_f32_x(svptrue_b32(), v61, v69); + svfloat32_t v72 = svadd_f32_x(svptrue_b32(), v34, v52); + svfloat32_t v83 = svsub_f32_x(svptrue_b32(), v34, v52); + svfloat32_t v84 = svsub_f32_x(svptrue_b32(), v52, v70); + svfloat32_t v85 = svsub_f32_x(svptrue_b32(), v70, v34); + svfloat32_t v86 = svadd_f32_x(svptrue_b32(), v35, v53); + svfloat32_t v88 = svsub_f32_x(svptrue_b32(), v35, v53); + svfloat32_t v89 = svsub_f32_x(svptrue_b32(), v53, v71); + svfloat32_t v90 = svsub_f32_x(svptrue_b32(), v71, v35); + svfloat32_t v73 = svadd_f32_x(svptrue_b32(), v72, v70); + svfloat32_t v87 = svadd_f32_x(svptrue_b32(), v86, v71); + svfloat32_t zero129 = svdup_n_f32(0); + svfloat32_t v129 = svcmla_f32_x(pred_full, zero129, v289, v88, 90); + svfloat32_t zero136 = svdup_n_f32(0); + svfloat32_t v136 = svcmla_f32_x(pred_full, zero136, v290, v89, 90); + svfloat32_t zero143 = svdup_n_f32(0); + svfloat32_t v143 = svcmla_f32_x(pred_full, zero143, v291, v90, 90); + svfloat32_t v82 = svadd_f32_x(svptrue_b32(), v73, v81); + svfloat32_t zero122 = svdup_n_f32(0); + svfloat32_t v122 = svcmla_f32_x(pred_full, zero122, v288, v87, 90); + svfloat32_t v144 = svmla_f32_x(pred_full, v82, v73, v284); + svfloat32_t v151 = svadd_f32_x(svptrue_b32(), v122, v129); + svfloat32_t v153 = svsub_f32_x(svptrue_b32(), v122, v129); + svfloat32_t v155 = svsub_f32_x(svptrue_b32(), v122, v136); + svint16_t v165 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v82, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v145 = svmla_f32_x(pred_full, v144, v83, v285); + svfloat32_t v147 = svmls_f32_x(pred_full, v144, v83, v285); + svfloat32_t v149 = svmls_f32_x(pred_full, v144, v84, v286); + svfloat32_t v152 = svadd_f32_x(svptrue_b32(), v151, v136); + svfloat32_t v154 = svsub_f32_x(svptrue_b32(), v153, v143); + svfloat32_t v156 = svadd_f32_x(svptrue_b32(), v155, v143); + svst1w_u64(pred_full, (unsigned *)(v299), svreinterpret_u64_s16(v165)); + svfloat32_t v146 = svmla_f32_x(pred_full, v145, v84, v286); + svfloat32_t v148 = svmls_f32_x(pred_full, v147, v85, v287); + svfloat32_t v150 = svmla_f32_x(pred_full, v149, v85, v287); + svfloat32_t v157 = svadd_f32_x(svptrue_b32(), v146, v152); + svfloat32_t v158 = svsub_f32_x(svptrue_b32(), v146, v152); + svfloat32_t v159 = svadd_f32_x(svptrue_b32(), v148, v154); + svfloat32_t v160 = svsub_f32_x(svptrue_b32(), v148, v154); + svfloat32_t v161 = svadd_f32_x(svptrue_b32(), v150, v156); + svfloat32_t v162 = 
svsub_f32_x(svptrue_b32(), v150, v156); + svint16_t v173 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v158, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v181 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v160, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v189 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v161, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v197 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v162, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v205 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v159, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v213 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v157, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v308), svreinterpret_u64_s16(v173)); + svst1w_u64(pred_full, (unsigned *)(v317), svreinterpret_u64_s16(v181)); + svst1w_u64(pred_full, (unsigned *)(v326), svreinterpret_u64_s16(v189)); + svst1w_u64(pred_full, (unsigned *)(v335), svreinterpret_u64_s16(v197)); + svst1w_u64(pred_full, (unsigned *)(v344), svreinterpret_u64_s16(v205)); + svst1w_u64(pred_full, (unsigned *)(v353), svreinterpret_u64_s16(v213)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uu8(const armral_cmplx_int16_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + for (int j = 0; j < howmany; j += 1) { + int16x4_t v48 = vld1s_s16(&v5[istride]); + float v103 = 1.0000000000000000e+00F; + float v104 = -1.0000000000000000e+00F; + float v111 = -7.0710678118654746e-01F; + float32x2_t v113 = (float32x2_t){v4, v4}; + float v118 = 7.0710678118654757e-01F; + int16x4_t v20 = vld1s_s16(&v5[0]); + float32x2_t v49 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v48)), 15); + float32x2_t v105 = (float32x2_t){v103, v104}; + float32x2_t v112 = (float32x2_t){v118, v111}; + float32x2_t v119 = (float32x2_t){v118, v118}; + float32x2_t v21 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v20)), 15); + int16x4_t v26 = vld1s_s16(&v5[istride * 4]); + int16x4_t v34 = vld1s_s16(&v5[istride * 2]); + int16x4_t v40 = vld1s_s16(&v5[istride * 6]); + int16x4_t v54 = vld1s_s16(&v5[istride * 5]); + int16x4_t v62 = vld1s_s16(&v5[istride * 3]); + int16x4_t v68 = vld1s_s16(&v5[istride * 7]); + float32x2_t v107 = vmul_f32(v113, v105); + float32x2_t v114 = vmul_f32(v113, v112); + float32x2_t v27 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v26)), 15); + float32x2_t v35 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v34)), 15); + float32x2_t v41 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v40)), 15); + float32x2_t v55 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v54)), 15); + float32x2_t v63 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v62)), 15); + float32x2_t v69 = 
vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v68)), 15); + float32x2_t v28 = vadd_f32(v21, v27); + float32x2_t v29 = vsub_f32(v21, v27); + float32x2_t v42 = vadd_f32(v35, v41); + float32x2_t v43 = vsub_f32(v35, v41); + float32x2_t v56 = vadd_f32(v49, v55); + float32x2_t v57 = vsub_f32(v49, v55); + float32x2_t v70 = vadd_f32(v63, v69); + float32x2_t v71 = vsub_f32(v63, v69); + float32x2_t v72 = vadd_f32(v28, v42); + float32x2_t v73 = vsub_f32(v28, v42); + float32x2_t v74 = vadd_f32(v56, v70); + float32x2_t v75 = vsub_f32(v56, v70); + float32x2_t v78 = vadd_f32(v57, v71); + float32x2_t v79 = vsub_f32(v57, v71); + float32x2_t v108 = vrev64_f32(v43); + float32x2_t v76 = vadd_f32(v72, v74); + float32x2_t v77 = vsub_f32(v72, v74); + float32x2_t v97 = vrev64_f32(v75); + float32x2_t v109 = vmul_f32(v108, v107); + float32x2_t v115 = vrev64_f32(v78); + float32x2_t v120 = vmul_f32(v79, v119); + float32x2_t v98 = vmul_f32(v97, v107); + float32x2_t v116 = vmul_f32(v115, v114); + float32x2_t v123 = vadd_f32(v29, v120); + float32x2_t v124 = vsub_f32(v29, v120); + int16x4_t v133 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v76, 15), (int32x2_t){0, 0})); + int16x4_t v157 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v77, 15), (int32x2_t){0, 0})); + float32x2_t v121 = vadd_f32(v73, v98); + float32x2_t v122 = vsub_f32(v73, v98); + float32x2_t v125 = vadd_f32(v109, v116); + float32x2_t v126 = vsub_f32(v109, v116); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v133), 0); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v157), 0); + float32x2_t v127 = vadd_f32(v123, v125); + float32x2_t v128 = vsub_f32(v123, v125); + float32x2_t v129 = vadd_f32(v124, v126); + float32x2_t v130 = vsub_f32(v124, v126); + int16x4_t v145 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v122, 15), (int32x2_t){0, 0})); + int16x4_t v169 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v121, 15), (int32x2_t){0, 0})); + int16x4_t v139 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v128, 15), (int32x2_t){0, 0})); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v145), 0); + int16x4_t v151 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v129, 15), (int32x2_t){0, 0})); + int16x4_t v163 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v130, 15), (int32x2_t){0, 0})); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v169), 0); + int16x4_t v175 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v127, 15), (int32x2_t){0, 0})); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v139), 0); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v151), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v163), 0); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v175), 0); + v5 += 1 * 1; + v6 += 1 * 1; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uu8(const armral_cmplx_int16_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * 1; + int64_t v12 = v10 * 1; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + float v126 = -1.0000000000000000e+00F; + float v133 = -7.0710678118654746e-01F; + float v140 = 7.0710678118654757e-01F; + const int32_t *v261 = &v5[v0]; + int32_t *v315 = &v6[v2]; + int64_t v27 = v0 * 4; + int64_t v37 = v0 * 2; + int64_t v45 = v0 * 6; + int64_t v63 = v0 * 5; + int64_t v73 = v0 * 3; + int64_t v81 = v0 * 7; + 
float v129 = v4 * v126; + float v136 = v4 * v133; + int64_t v171 = v2 * 2; + int64_t v179 = v2 * 3; + int64_t v187 = v2 * 4; + int64_t v195 = v2 * 5; + int64_t v203 = v2 * 6; + int64_t v211 = v2 * 7; + const int32_t *v225 = &v5[0]; + svfloat32_t v298 = svdup_n_f32(v140); + int32_t *v306 = &v6[0]; + svfloat32_t v61 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v261[0])), + 1.F / (1ULL << 15ULL)); + const int32_t *v234 = &v5[v27]; + const int32_t *v243 = &v5[v37]; + const int32_t *v252 = &v5[v45]; + const int32_t *v270 = &v5[v63]; + const int32_t *v279 = &v5[v73]; + const int32_t *v288 = &v5[v81]; + svfloat32_t v296 = svdup_n_f32(v129); + svfloat32_t v297 = svdup_n_f32(v136); + int32_t *v324 = &v6[v171]; + int32_t *v333 = &v6[v179]; + int32_t *v342 = &v6[v187]; + int32_t *v351 = &v6[v195]; + int32_t *v360 = &v6[v203]; + int32_t *v369 = &v6[v211]; + svfloat32_t v25 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v225[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v33 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v234[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v43 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v243[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v51 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v252[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v69 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v270[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v79 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v279[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v87 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v288[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v34 = svadd_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v35 = svsub_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v52 = svadd_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v53 = svsub_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v70 = svadd_f32_x(svptrue_b32(), v61, v69); + svfloat32_t v71 = svsub_f32_x(svptrue_b32(), v61, v69); + svfloat32_t v88 = svadd_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v89 = svsub_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v90 = svadd_f32_x(svptrue_b32(), v34, v52); + svfloat32_t v91 = svsub_f32_x(svptrue_b32(), v34, v52); + svfloat32_t v92 = svadd_f32_x(svptrue_b32(), v70, v88); + svfloat32_t v93 = svsub_f32_x(svptrue_b32(), v70, v88); + svfloat32_t v96 = svadd_f32_x(svptrue_b32(), v71, v89); + svfloat32_t v97 = svsub_f32_x(svptrue_b32(), v71, v89); + svfloat32_t zero131 = svdup_n_f32(0); + svfloat32_t v131 = svcmla_f32_x(pred_full, zero131, v296, v53, 90); + svfloat32_t v94 = svadd_f32_x(svptrue_b32(), v90, v92); + svfloat32_t v95 = svsub_f32_x(svptrue_b32(), v90, v92); + svfloat32_t zero119 = svdup_n_f32(0); + svfloat32_t v119 = svcmla_f32_x(pred_full, zero119, v296, v93, 90); + svfloat32_t zero138 = svdup_n_f32(0); + svfloat32_t v138 = svcmla_f32_x(pred_full, zero138, v297, v96, 90); + svfloat32_t v144 = svadd_f32_x(svptrue_b32(), v91, v119); + svfloat32_t v145 = svsub_f32_x(svptrue_b32(), v91, v119); + svfloat32_t v146 = svmla_f32_x(pred_full, v35, v97, v298); + svfloat32_t v147 = svmls_f32_x(pred_full, v35, v97, v298); + svfloat32_t v148 = 
svadd_f32_x(svptrue_b32(), v131, v138); + svfloat32_t v149 = svsub_f32_x(svptrue_b32(), v131, v138); + svint16_t v156 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v94, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v188 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v95, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v150 = svadd_f32_x(svptrue_b32(), v146, v148); + svfloat32_t v151 = svsub_f32_x(svptrue_b32(), v146, v148); + svfloat32_t v152 = svadd_f32_x(svptrue_b32(), v147, v149); + svfloat32_t v153 = svsub_f32_x(svptrue_b32(), v147, v149); + svint16_t v172 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v145, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v204 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v144, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v306), svreinterpret_u64_s16(v156)); + svst1w_u64(pred_full, (unsigned *)(v342), svreinterpret_u64_s16(v188)); + svint16_t v164 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v151, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v180 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v152, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v196 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v153, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v212 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v150, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v324), svreinterpret_u64_s16(v172)); + svst1w_u64(pred_full, (unsigned *)(v360), svreinterpret_u64_s16(v204)); + svst1w_u64(pred_full, (unsigned *)(v315), svreinterpret_u64_s16(v164)); + svst1w_u64(pred_full, (unsigned *)(v333), svreinterpret_u64_s16(v180)); + svst1w_u64(pred_full, (unsigned *)(v351), svreinterpret_u64_s16(v196)); + svst1w_u64(pred_full, (unsigned *)(v369), svreinterpret_u64_s16(v212)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uu9(const armral_cmplx_int16_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + for (int j = 0; j < howmany; j += 1) { + int16x4_t v20 = vld1s_s16(&v5[istride]); + float v95 = -5.0000000000000000e-01F; + float v106 = -1.4999999999999998e+00F; + float v109 = 8.6602540378443871e-01F; + float v110 = -8.6602540378443871e-01F; + float v117 = 7.6604444311897801e-01F; + float v121 = 9.3969262078590832e-01F; + float v125 = -1.7364817766693039e-01F; + float v128 = 6.4278760968653925e-01F; + float v129 = -6.4278760968653925e-01F; 
+ float v135 = -3.4202014332566888e-01F; + float v136 = 3.4202014332566888e-01F; + float v142 = 9.8480775301220802e-01F; + float v143 = -9.8480775301220802e-01F; + float32x2_t v145 = (float32x2_t){v4, v4}; + float32x2_t v21 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v20)), 15); + int16x4_t v79 = vld1s_s16(&v5[0]); + float32x2_t v96 = (float32x2_t){v95, v95}; + float32x2_t v107 = (float32x2_t){v106, v106}; + float32x2_t v111 = (float32x2_t){v109, v110}; + float32x2_t v118 = (float32x2_t){v117, v117}; + float32x2_t v122 = (float32x2_t){v121, v121}; + float32x2_t v126 = (float32x2_t){v125, v125}; + float32x2_t v130 = (float32x2_t){v128, v129}; + float32x2_t v137 = (float32x2_t){v135, v136}; + float32x2_t v144 = (float32x2_t){v142, v143}; + int16x4_t v26 = vld1s_s16(&v5[istride * 8]); + int16x4_t v34 = vld1s_s16(&v5[istride * 7]); + int16x4_t v40 = vld1s_s16(&v5[istride * 2]); + int16x4_t v48 = vld1s_s16(&v5[istride * 3]); + int16x4_t v54 = vld1s_s16(&v5[istride * 6]); + int16x4_t v62 = vld1s_s16(&v5[istride * 4]); + int16x4_t v68 = vld1s_s16(&v5[istride * 5]); + float32x2_t v80 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v79)), 15); + float32x2_t v113 = vmul_f32(v145, v111); + float32x2_t v132 = vmul_f32(v145, v130); + float32x2_t v139 = vmul_f32(v145, v137); + float32x2_t v146 = vmul_f32(v145, v144); + float32x2_t v27 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v26)), 15); + float32x2_t v35 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v34)), 15); + float32x2_t v41 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v40)), 15); + float32x2_t v49 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v48)), 15); + float32x2_t v55 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v54)), 15); + float32x2_t v63 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v62)), 15); + float32x2_t v69 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v68)), 15); + float32x2_t v28 = vadd_f32(v21, v27); + float32x2_t v29 = vsub_f32(v21, v27); + float32x2_t v42 = vadd_f32(v35, v41); + float32x2_t v43 = vsub_f32(v35, v41); + float32x2_t v56 = vadd_f32(v49, v55); + float32x2_t v57 = vsub_f32(v49, v55); + float32x2_t v70 = vadd_f32(v63, v69); + float32x2_t v71 = vsub_f32(v63, v69); + float32x2_t v72 = vadd_f32(v28, v42); + float32x2_t v82 = vadd_f32(v29, v43); + float32x2_t v84 = vsub_f32(v28, v42); + float32x2_t v85 = vsub_f32(v42, v70); + float32x2_t v86 = vsub_f32(v70, v28); + float32x2_t v87 = vsub_f32(v29, v43); + float32x2_t v88 = vsub_f32(v43, v71); + float32x2_t v89 = vsub_f32(v71, v29); + float32x2_t v108 = vmul_f32(v56, v107); + float32x2_t v114 = vrev64_f32(v57); + float32x2_t v73 = vadd_f32(v72, v70); + float32x2_t v83 = vadd_f32(v82, v71); + float32x2_t v115 = vmul_f32(v114, v113); + float32x2_t v119 = vmul_f32(v84, v118); + float32x2_t v123 = vmul_f32(v85, v122); + float32x2_t v127 = vmul_f32(v86, v126); + float32x2_t v133 = vrev64_f32(v87); + float32x2_t v140 = vrev64_f32(v88); + float32x2_t v147 = vrev64_f32(v89); + float32x2_t v74 = vadd_f32(v73, v56); + float32x2_t v97 = vmul_f32(v73, v96); + float32x2_t v103 = vrev64_f32(v83); + float32x2_t v134 = vmul_f32(v133, v132); + float32x2_t v141 = vmul_f32(v140, v139); + float32x2_t v148 = vmul_f32(v147, v146); + float32x2_t v81 = vadd_f32(v74, v80); + float32x2_t v104 = vmul_f32(v103, v113); + float32x2_t v149 = vadd_f32(v97, v97); + float32x2_t v162 = vadd_f32(v115, v134); + float32x2_t v164 = vsub_f32(v115, v141); + float32x2_t v166 = vsub_f32(v115, v134); + float32x2_t v150 = vadd_f32(v149, v97); + float32x2_t v154 = vadd_f32(v81, v108); + float32x2_t v163 = vadd_f32(v162, v141); + float32x2_t v165 = vadd_f32(v164, 
v148); + float32x2_t v167 = vsub_f32(v166, v148); + int16x4_t v176 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v81, 15), (int32x2_t){0, 0})); + float32x2_t v151 = vadd_f32(v81, v150); + float32x2_t v155 = vadd_f32(v154, v149); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v176), 0); + float32x2_t v152 = vadd_f32(v151, v104); + float32x2_t v153 = vsub_f32(v151, v104); + float32x2_t v156 = vadd_f32(v155, v119); + float32x2_t v158 = vsub_f32(v155, v123); + float32x2_t v160 = vsub_f32(v155, v119); + float32x2_t v157 = vadd_f32(v156, v123); + float32x2_t v159 = vadd_f32(v158, v127); + float32x2_t v161 = vsub_f32(v160, v127); + int16x4_t v194 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v153, 15), (int32x2_t){0, 0})); + int16x4_t v212 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v152, 15), (int32x2_t){0, 0})); + float32x2_t v168 = vadd_f32(v157, v163); + float32x2_t v169 = vsub_f32(v157, v163); + float32x2_t v170 = vadd_f32(v159, v165); + float32x2_t v171 = vsub_f32(v159, v165); + float32x2_t v172 = vadd_f32(v161, v167); + float32x2_t v173 = vsub_f32(v161, v167); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v194), 0); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v212), 0); + int16x4_t v182 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v169, 15), (int32x2_t){0, 0})); + int16x4_t v188 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v170, 15), (int32x2_t){0, 0})); + int16x4_t v200 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v173, 15), (int32x2_t){0, 0})); + int16x4_t v206 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v172, 15), (int32x2_t){0, 0})); + int16x4_t v218 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v171, 15), (int32x2_t){0, 0})); + int16x4_t v224 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v168, 15), (int32x2_t){0, 0})); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v182), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v188), 0); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v200), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v206), 0); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v218), 0); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v224), 0); + v5 += 1 * 1; + v6 += 1 * 1; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uu9(const armral_cmplx_int16_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * 1; + int64_t v12 = v10 * 1; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + float v116 = -5.0000000000000000e-01F; + float v128 = -1.4999999999999998e+00F; + float v133 = -8.6602540378443871e-01F; + float v140 = 7.6604444311897801e-01F; + float v145 = 9.3969262078590832e-01F; + float v150 = -1.7364817766693039e-01F; + float v155 = -6.4278760968653925e-01F; + float v162 = 3.4202014332566888e-01F; + float v169 = -9.8480775301220802e-01F; + const int32_t *v278 = &v5[v0]; + int32_t *v381 = &v6[v2]; + int64_t v27 = v0 * 8; + int64_t v37 = v0 * 7; + int64_t v45 = v0 * 2; + int64_t v55 = v0 * 3; + int64_t v63 = v0 * 6; + int64_t v73 = v0 * 4; + int64_t v81 = v0 * 5; + float v136 = v4 * v133; + float v158 = v4 * v155; + float v165 = v4 * v162; + float v172 = v4 * v169; + int64_t v217 = v2 * 2; + int64_t v225 = v2 * 3; + int64_t v233 = v2 * 4; + int64_t v241 = v2 * 5; + int64_t v249 = v2 * 6; + int64_t 
v257 = v2 * 7; + int64_t v265 = v2 * 8; + const int32_t *v351 = &v5[0]; + svfloat32_t v355 = svdup_n_f32(v116); + svfloat32_t v357 = svdup_n_f32(v128); + svfloat32_t v359 = svdup_n_f32(v140); + svfloat32_t v360 = svdup_n_f32(v145); + svfloat32_t v361 = svdup_n_f32(v150); + int32_t *v372 = &v6[0]; + svfloat32_t v25 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v278[0])), + 1.F / (1ULL << 15ULL)); + const int32_t *v287 = &v5[v27]; + const int32_t *v296 = &v5[v37]; + const int32_t *v305 = &v5[v45]; + const int32_t *v314 = &v5[v55]; + const int32_t *v323 = &v5[v63]; + const int32_t *v332 = &v5[v73]; + const int32_t *v341 = &v5[v81]; + svfloat32_t v358 = svdup_n_f32(v136); + svfloat32_t v362 = svdup_n_f32(v158); + svfloat32_t v363 = svdup_n_f32(v165); + svfloat32_t v364 = svdup_n_f32(v172); + int32_t *v390 = &v6[v217]; + int32_t *v399 = &v6[v225]; + int32_t *v408 = &v6[v233]; + int32_t *v417 = &v6[v241]; + int32_t *v426 = &v6[v249]; + int32_t *v435 = &v6[v257]; + int32_t *v444 = &v6[v265]; + svfloat32_t v100 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v351[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v33 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v287[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v43 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v296[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v51 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v305[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v61 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v314[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v69 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v323[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v79 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v332[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v87 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v341[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v34 = svadd_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v35 = svsub_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v52 = svadd_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v53 = svsub_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v70 = svadd_f32_x(svptrue_b32(), v61, v69); + svfloat32_t v71 = svsub_f32_x(svptrue_b32(), v61, v69); + svfloat32_t v88 = svadd_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v89 = svsub_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v90 = svadd_f32_x(svptrue_b32(), v34, v52); + svfloat32_t v102 = svadd_f32_x(svptrue_b32(), v35, v53); + svfloat32_t v104 = svsub_f32_x(svptrue_b32(), v34, v52); + svfloat32_t v105 = svsub_f32_x(svptrue_b32(), v52, v88); + svfloat32_t v106 = svsub_f32_x(svptrue_b32(), v88, v34); + svfloat32_t v107 = svsub_f32_x(svptrue_b32(), v35, v53); + svfloat32_t v108 = svsub_f32_x(svptrue_b32(), v53, v89); + svfloat32_t v109 = svsub_f32_x(svptrue_b32(), v89, v35); + svfloat32_t zero138 = svdup_n_f32(0); + svfloat32_t v138 = svcmla_f32_x(pred_full, zero138, v358, v71, 90); + svfloat32_t v91 = svadd_f32_x(svptrue_b32(), v90, v88); + svfloat32_t v103 = svadd_f32_x(svptrue_b32(), v102, v89); + svfloat32_t zero160 = svdup_n_f32(0); + svfloat32_t v160 = 
svcmla_f32_x(pred_full, zero160, v362, v107, 90); + svfloat32_t zero167 = svdup_n_f32(0); + svfloat32_t v167 = svcmla_f32_x(pred_full, zero167, v363, v108, 90); + svfloat32_t zero174 = svdup_n_f32(0); + svfloat32_t v174 = svcmla_f32_x(pred_full, zero174, v364, v109, 90); + svfloat32_t v92 = svadd_f32_x(svptrue_b32(), v91, v70); + svfloat32_t v119 = svmul_f32_x(svptrue_b32(), v91, v355); + svfloat32_t zero126 = svdup_n_f32(0); + svfloat32_t v126 = svcmla_f32_x(pred_full, zero126, v358, v103, 90); + svfloat32_t v188 = svadd_f32_x(svptrue_b32(), v138, v160); + svfloat32_t v190 = svsub_f32_x(svptrue_b32(), v138, v167); + svfloat32_t v192 = svsub_f32_x(svptrue_b32(), v138, v160); + svfloat32_t v101 = svadd_f32_x(svptrue_b32(), v92, v100); + svfloat32_t v175 = svadd_f32_x(svptrue_b32(), v119, v119); + svfloat32_t v189 = svadd_f32_x(svptrue_b32(), v188, v167); + svfloat32_t v191 = svadd_f32_x(svptrue_b32(), v190, v174); + svfloat32_t v193 = svsub_f32_x(svptrue_b32(), v192, v174); + svfloat32_t v176 = svmla_f32_x(pred_full, v175, v91, v355); + svfloat32_t v180 = svmla_f32_x(pred_full, v101, v70, v357); + svint16_t v202 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v101, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v177 = svadd_f32_x(svptrue_b32(), v101, v176); + svfloat32_t v181 = svadd_f32_x(svptrue_b32(), v180, v175); + svst1w_u64(pred_full, (unsigned *)(v372), svreinterpret_u64_s16(v202)); + svfloat32_t v178 = svadd_f32_x(svptrue_b32(), v177, v126); + svfloat32_t v179 = svsub_f32_x(svptrue_b32(), v177, v126); + svfloat32_t v182 = svmla_f32_x(pred_full, v181, v104, v359); + svfloat32_t v184 = svmls_f32_x(pred_full, v181, v105, v360); + svfloat32_t v186 = svmls_f32_x(pred_full, v181, v104, v359); + svfloat32_t v183 = svmla_f32_x(pred_full, v182, v105, v360); + svfloat32_t v185 = svmla_f32_x(pred_full, v184, v106, v361); + svfloat32_t v187 = svmls_f32_x(pred_full, v186, v106, v361); + svint16_t v226 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v179, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v250 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v178, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v194 = svadd_f32_x(svptrue_b32(), v183, v189); + svfloat32_t v195 = svsub_f32_x(svptrue_b32(), v183, v189); + svfloat32_t v196 = svadd_f32_x(svptrue_b32(), v185, v191); + svfloat32_t v197 = svsub_f32_x(svptrue_b32(), v185, v191); + svfloat32_t v198 = svadd_f32_x(svptrue_b32(), v187, v193); + svfloat32_t v199 = svsub_f32_x(svptrue_b32(), v187, v193); + svst1w_u64(pred_full, (unsigned *)(v399), svreinterpret_u64_s16(v226)); + svst1w_u64(pred_full, (unsigned *)(v426), svreinterpret_u64_s16(v250)); + svint16_t v210 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v195, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v218 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v196, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v234 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + 
pred_full, svmul_n_f32_x(pred_full, v199, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v242 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v198, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v258 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v197, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v266 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v194, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v381), svreinterpret_u64_s16(v210)); + svst1w_u64(pred_full, (unsigned *)(v390), svreinterpret_u64_s16(v218)); + svst1w_u64(pred_full, (unsigned *)(v408), svreinterpret_u64_s16(v234)); + svst1w_u64(pred_full, (unsigned *)(v417), svreinterpret_u64_s16(v242)); + svst1w_u64(pred_full, (unsigned *)(v435), svreinterpret_u64_s16(v258)); + svst1w_u64(pred_full, (unsigned *)(v444), svreinterpret_u64_s16(v266)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uu10(const armral_cmplx_int16_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + for (int j = 0; j < howmany; j += 1) { + int16x4_t v68 = vld1s_s16(&v5[istride]); + float v149 = -1.2500000000000000e+00F; + float v153 = 5.5901699437494745e-01F; + float v156 = 1.5388417685876268e+00F; + float v157 = -1.5388417685876268e+00F; + float v163 = 5.8778525229247325e-01F; + float v164 = -5.8778525229247325e-01F; + float v170 = 3.6327126400268028e-01F; + float v171 = -3.6327126400268028e-01F; + float32x2_t v173 = (float32x2_t){v4, v4}; + int16x4_t v20 = vld1s_s16(&v5[0]); + float32x2_t v69 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v68)), 15); + float32x2_t v150 = (float32x2_t){v149, v149}; + float32x2_t v154 = (float32x2_t){v153, v153}; + float32x2_t v158 = (float32x2_t){v156, v157}; + float32x2_t v165 = (float32x2_t){v163, v164}; + float32x2_t v172 = (float32x2_t){v170, v171}; + float32x2_t v21 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v20)), 15); + int16x4_t v26 = vld1s_s16(&v5[istride * 5]); + int16x4_t v34 = vld1s_s16(&v5[istride * 2]); + int16x4_t v40 = vld1s_s16(&v5[istride * 7]); + int16x4_t v48 = vld1s_s16(&v5[istride * 4]); + int16x4_t v54 = vld1s_s16(&v5[istride * 9]); + int16x4_t v62 = vld1s_s16(&v5[istride * 6]); + int16x4_t v76 = vld1s_s16(&v5[istride * 8]); + int16x4_t v82 = vld1s_s16(&v5[istride * 3]); + float32x2_t v160 = vmul_f32(v173, v158); + float32x2_t v167 = vmul_f32(v173, v165); + float32x2_t v174 = vmul_f32(v173, v172); + float32x2_t v27 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v26)), 15); + float32x2_t v35 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v34)), 15); + float32x2_t v41 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v40)), 15); + float32x2_t v49 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v48)), 15); + float32x2_t v55 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v54)), 15); + float32x2_t v63 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v62)), 15); + float32x2_t v77 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v76)), 15); + float32x2_t v83 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v82)), 
15); + float32x2_t v28 = vadd_f32(v21, v27); + float32x2_t v29 = vsub_f32(v21, v27); + float32x2_t v42 = vadd_f32(v35, v41); + float32x2_t v43 = vsub_f32(v35, v41); + float32x2_t v56 = vadd_f32(v49, v55); + float32x2_t v57 = vsub_f32(v49, v55); + float32x2_t v70 = vadd_f32(v63, v69); + float32x2_t v71 = vsub_f32(v63, v69); + float32x2_t v84 = vadd_f32(v77, v83); + float32x2_t v85 = vsub_f32(v77, v83); + float32x2_t v86 = vadd_f32(v42, v84); + float32x2_t v87 = vsub_f32(v42, v84); + float32x2_t v88 = vadd_f32(v70, v56); + float32x2_t v89 = vsub_f32(v70, v56); + float32x2_t v136 = vadd_f32(v43, v85); + float32x2_t v137 = vsub_f32(v43, v85); + float32x2_t v138 = vadd_f32(v71, v57); + float32x2_t v139 = vsub_f32(v71, v57); + float32x2_t v90 = vadd_f32(v86, v88); + float32x2_t v91 = vsub_f32(v86, v88); + float32x2_t v92 = vadd_f32(v87, v89); + float32x2_t v111 = vrev64_f32(v87); + float32x2_t v125 = vrev64_f32(v89); + float32x2_t v140 = vadd_f32(v136, v138); + float32x2_t v141 = vsub_f32(v136, v138); + float32x2_t v142 = vadd_f32(v137, v139); + float32x2_t v161 = vrev64_f32(v137); + float32x2_t v175 = vrev64_f32(v139); + float32x2_t v93 = vadd_f32(v90, v28); + float32x2_t v101 = vmul_f32(v90, v150); + float32x2_t v105 = vmul_f32(v91, v154); + float32x2_t v112 = vmul_f32(v111, v160); + float32x2_t v118 = vrev64_f32(v92); + float32x2_t v126 = vmul_f32(v125, v174); + float32x2_t v143 = vadd_f32(v140, v29); + float32x2_t v151 = vmul_f32(v140, v150); + float32x2_t v155 = vmul_f32(v141, v154); + float32x2_t v162 = vmul_f32(v161, v160); + float32x2_t v168 = vrev64_f32(v142); + float32x2_t v176 = vmul_f32(v175, v174); + float32x2_t v119 = vmul_f32(v118, v167); + float32x2_t v127 = vadd_f32(v93, v101); + float32x2_t v169 = vmul_f32(v168, v167); + float32x2_t v177 = vadd_f32(v143, v151); + int16x4_t v188 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v93, 15), (int32x2_t){0, 0})); + int16x4_t v194 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v143, 15), (int32x2_t){0, 0})); + float32x2_t v128 = vadd_f32(v127, v105); + float32x2_t v129 = vsub_f32(v127, v105); + float32x2_t v130 = vsub_f32(v112, v119); + float32x2_t v131 = vadd_f32(v119, v126); + float32x2_t v178 = vadd_f32(v177, v155); + float32x2_t v179 = vsub_f32(v177, v155); + float32x2_t v180 = vsub_f32(v162, v169); + float32x2_t v181 = vadd_f32(v169, v176); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v188), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v194), 0); + float32x2_t v132 = vadd_f32(v128, v130); + float32x2_t v133 = vsub_f32(v128, v130); + float32x2_t v134 = vadd_f32(v129, v131); + float32x2_t v135 = vsub_f32(v129, v131); + float32x2_t v182 = vadd_f32(v178, v180); + float32x2_t v183 = vsub_f32(v178, v180); + float32x2_t v184 = vadd_f32(v179, v181); + float32x2_t v185 = vsub_f32(v179, v181); + int16x4_t v200 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v133, 15), (int32x2_t){0, 0})); + int16x4_t v206 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v183, 15), (int32x2_t){0, 0})); + int16x4_t v212 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v135, 15), (int32x2_t){0, 0})); + int16x4_t v218 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v185, 15), (int32x2_t){0, 0})); + int16x4_t v224 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v134, 15), (int32x2_t){0, 0})); + int16x4_t v230 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v184, 15), (int32x2_t){0, 0})); + int16x4_t v236 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v132, 15), (int32x2_t){0, 0})); + int16x4_t v242 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v182, 15), (int32x2_t){0, 0})); + v6[ostride * 6] = 
vget_lane_s32(vreinterpret_s32_s16(v200), 0); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v206), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v212), 0); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v218), 0); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v224), 0); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v230), 0); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v236), 0); + v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v242), 0); + v5 += 1 * 1; + v6 += 1 * 1; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uu10(const armral_cmplx_int16_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * 1; + int64_t v12 = v10 * 1; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + float v175 = -1.2500000000000000e+00F; + float v180 = 5.5901699437494745e-01F; + float v185 = -1.5388417685876268e+00F; + float v192 = -5.8778525229247325e-01F; + float v199 = -3.6327126400268028e-01F; + const int32_t *v364 = &v5[v0]; + int32_t *v431 = &v6[v2]; + int64_t v27 = v0 * 5; + int64_t v37 = v0 * 2; + int64_t v45 = v0 * 7; + int64_t v55 = v0 * 4; + int64_t v63 = v0 * 9; + int64_t v73 = v0 * 6; + int64_t v91 = v0 * 8; + int64_t v99 = v0 * 3; + float v188 = v4 * v185; + float v195 = v4 * v192; + float v202 = v4 * v199; + int64_t v223 = v2 * 5; + int64_t v231 = v2 * 6; + int64_t v247 = v2 * 2; + int64_t v255 = v2 * 7; + int64_t v263 = v2 * 8; + int64_t v271 = v2 * 3; + int64_t v279 = v2 * 4; + int64_t v287 = v2 * 9; + const int32_t *v301 = &v5[0]; + svfloat32_t v392 = svdup_n_f32(v175); + svfloat32_t v393 = svdup_n_f32(v180); + int32_t *v404 = &v6[0]; + svfloat32_t v87 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v364[0])), + 1.F / (1ULL << 15ULL)); + const int32_t *v310 = &v5[v27]; + const int32_t *v319 = &v5[v37]; + const int32_t *v328 = &v5[v45]; + const int32_t *v337 = &v5[v55]; + const int32_t *v346 = &v5[v63]; + const int32_t *v355 = &v5[v73]; + const int32_t *v373 = &v5[v91]; + const int32_t *v382 = &v5[v99]; + svfloat32_t v394 = svdup_n_f32(v188); + svfloat32_t v395 = svdup_n_f32(v195); + svfloat32_t v396 = svdup_n_f32(v202); + int32_t *v413 = &v6[v223]; + int32_t *v422 = &v6[v231]; + int32_t *v440 = &v6[v247]; + int32_t *v449 = &v6[v255]; + int32_t *v458 = &v6[v263]; + int32_t *v467 = &v6[v271]; + int32_t *v476 = &v6[v279]; + int32_t *v485 = &v6[v287]; + svfloat32_t v25 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v301[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v33 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v310[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v43 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v319[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v51 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v328[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v61 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v337[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v69 = 
svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v346[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v79 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v355[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v97 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v373[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v105 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v382[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v34 = svadd_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v35 = svsub_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v52 = svadd_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v53 = svsub_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v70 = svadd_f32_x(svptrue_b32(), v61, v69); + svfloat32_t v71 = svsub_f32_x(svptrue_b32(), v61, v69); + svfloat32_t v88 = svadd_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v89 = svsub_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v106 = svadd_f32_x(svptrue_b32(), v97, v105); + svfloat32_t v107 = svsub_f32_x(svptrue_b32(), v97, v105); + svfloat32_t v108 = svadd_f32_x(svptrue_b32(), v52, v106); + svfloat32_t v109 = svsub_f32_x(svptrue_b32(), v52, v106); + svfloat32_t v110 = svadd_f32_x(svptrue_b32(), v88, v70); + svfloat32_t v111 = svsub_f32_x(svptrue_b32(), v88, v70); + svfloat32_t v161 = svadd_f32_x(svptrue_b32(), v53, v107); + svfloat32_t v162 = svsub_f32_x(svptrue_b32(), v53, v107); + svfloat32_t v163 = svadd_f32_x(svptrue_b32(), v89, v71); + svfloat32_t v164 = svsub_f32_x(svptrue_b32(), v89, v71); + svfloat32_t v112 = svadd_f32_x(svptrue_b32(), v108, v110); + svfloat32_t v113 = svsub_f32_x(svptrue_b32(), v108, v110); + svfloat32_t v114 = svadd_f32_x(svptrue_b32(), v109, v111); + svfloat32_t zero137 = svdup_n_f32(0); + svfloat32_t v137 = svcmla_f32_x(pred_full, zero137, v394, v109, 90); + svfloat32_t v165 = svadd_f32_x(svptrue_b32(), v161, v163); + svfloat32_t v166 = svsub_f32_x(svptrue_b32(), v161, v163); + svfloat32_t v167 = svadd_f32_x(svptrue_b32(), v162, v164); + svfloat32_t zero190 = svdup_n_f32(0); + svfloat32_t v190 = svcmla_f32_x(pred_full, zero190, v394, v162, 90); + svfloat32_t v115 = svadd_f32_x(svptrue_b32(), v112, v34); + svfloat32_t zero144 = svdup_n_f32(0); + svfloat32_t v144 = svcmla_f32_x(pred_full, zero144, v395, v114, 90); + svfloat32_t v168 = svadd_f32_x(svptrue_b32(), v165, v35); + svfloat32_t zero197 = svdup_n_f32(0); + svfloat32_t v197 = svcmla_f32_x(pred_full, zero197, v395, v167, 90); + svfloat32_t v152 = svmla_f32_x(pred_full, v115, v112, v392); + svfloat32_t v155 = svsub_f32_x(svptrue_b32(), v137, v144); + svfloat32_t v156 = svcmla_f32_x(pred_full, v144, v396, v111, 90); + svfloat32_t v205 = svmla_f32_x(pred_full, v168, v165, v392); + svfloat32_t v208 = svsub_f32_x(svptrue_b32(), v190, v197); + svfloat32_t v209 = svcmla_f32_x(pred_full, v197, v396, v164, 90); + svint16_t v216 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v115, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v224 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v168, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v153 = svmla_f32_x(pred_full, v152, v113, v393); + svfloat32_t v154 = svmls_f32_x(pred_full, 
v152, v113, v393); + svfloat32_t v206 = svmla_f32_x(pred_full, v205, v166, v393); + svfloat32_t v207 = svmls_f32_x(pred_full, v205, v166, v393); + svst1w_u64(pred_full, (unsigned *)(v404), svreinterpret_u64_s16(v216)); + svst1w_u64(pred_full, (unsigned *)(v413), svreinterpret_u64_s16(v224)); + svfloat32_t v157 = svadd_f32_x(svptrue_b32(), v153, v155); + svfloat32_t v158 = svsub_f32_x(svptrue_b32(), v153, v155); + svfloat32_t v159 = svadd_f32_x(svptrue_b32(), v154, v156); + svfloat32_t v160 = svsub_f32_x(svptrue_b32(), v154, v156); + svfloat32_t v210 = svadd_f32_x(svptrue_b32(), v206, v208); + svfloat32_t v211 = svsub_f32_x(svptrue_b32(), v206, v208); + svfloat32_t v212 = svadd_f32_x(svptrue_b32(), v207, v209); + svfloat32_t v213 = svsub_f32_x(svptrue_b32(), v207, v209); + svint16_t v232 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v158, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v240 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v211, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v248 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v160, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v256 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v213, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v264 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v159, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v272 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v212, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v280 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v157, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v288 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v210, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v422), svreinterpret_u64_s16(v232)); + svst1w_u64(pred_full, (unsigned *)(v431), svreinterpret_u64_s16(v240)); + svst1w_u64(pred_full, (unsigned *)(v440), svreinterpret_u64_s16(v248)); + svst1w_u64(pred_full, (unsigned *)(v449), svreinterpret_u64_s16(v256)); + svst1w_u64(pred_full, (unsigned *)(v458), svreinterpret_u64_s16(v264)); + svst1w_u64(pred_full, (unsigned *)(v467), svreinterpret_u64_s16(v272)); + svst1w_u64(pred_full, (unsigned *)(v476), svreinterpret_u64_s16(v280)); + svst1w_u64(pred_full, (unsigned *)(v485), svreinterpret_u64_s16(v288)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uu11(const armral_cmplx_int16_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + for 
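+  /* Neon length-11 kernel: each iteration below loads the 11 complex Q15
+     inputs (offsets 0..10 * istride), converts them to float32 with a 2^-15
+     scale, evaluates the butterflies, and narrows the 11 outputs back to
+     Q15. */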
(int j = 0; j < howmany; j += 1) { + int16x4_t v20 = vld1s_s16(&v5[istride]); + float v124 = 1.1000000000000001e+00F; + float v127 = 3.3166247903554003e-01F; + float v128 = -3.3166247903554003e-01F; + float v135 = 5.1541501300188641e-01F; + float v139 = 9.4125353283118118e-01F; + float v143 = 1.4143537075597825e+00F; + float v147 = 8.5949297361449750e-01F; + float v151 = 4.2314838273285138e-02F; + float v155 = 3.8639279888589606e-01F; + float v159 = 5.1254589567200015e-01F; + float v163 = 1.0702757469471715e+00F; + float v167 = 5.5486073394528512e-01F; + float v170 = 1.2412944743900585e+00F; + float v171 = -1.2412944743900585e+00F; + float v177 = 2.0897833842005756e-01F; + float v178 = -2.0897833842005756e-01F; + float v184 = 3.7415717312460811e-01F; + float v185 = -3.7415717312460811e-01F; + float v191 = 4.9929922194110327e-02F; + float v192 = -4.9929922194110327e-02F; + float v198 = 6.5815896284539266e-01F; + float v199 = -6.5815896284539266e-01F; + float v205 = 6.3306543373877577e-01F; + float v206 = -6.3306543373877577e-01F; + float v212 = 1.0822460581641109e+00F; + float v213 = -1.0822460581641109e+00F; + float v219 = 8.1720737907134022e-01F; + float v220 = -8.1720737907134022e-01F; + float v226 = 4.2408709531871824e-01F; + float v227 = -4.2408709531871824e-01F; + float32x2_t v229 = (float32x2_t){v4, v4}; + float32x2_t v21 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v20)), 15); + int16x4_t v96 = vld1s_s16(&v5[0]); + float32x2_t v125 = (float32x2_t){v124, v124}; + float32x2_t v129 = (float32x2_t){v127, v128}; + float32x2_t v136 = (float32x2_t){v135, v135}; + float32x2_t v140 = (float32x2_t){v139, v139}; + float32x2_t v144 = (float32x2_t){v143, v143}; + float32x2_t v148 = (float32x2_t){v147, v147}; + float32x2_t v152 = (float32x2_t){v151, v151}; + float32x2_t v156 = (float32x2_t){v155, v155}; + float32x2_t v160 = (float32x2_t){v159, v159}; + float32x2_t v164 = (float32x2_t){v163, v163}; + float32x2_t v168 = (float32x2_t){v167, v167}; + float32x2_t v172 = (float32x2_t){v170, v171}; + float32x2_t v179 = (float32x2_t){v177, v178}; + float32x2_t v186 = (float32x2_t){v184, v185}; + float32x2_t v193 = (float32x2_t){v191, v192}; + float32x2_t v200 = (float32x2_t){v198, v199}; + float32x2_t v207 = (float32x2_t){v205, v206}; + float32x2_t v214 = (float32x2_t){v212, v213}; + float32x2_t v221 = (float32x2_t){v219, v220}; + float32x2_t v228 = (float32x2_t){v226, v227}; + int16x4_t v26 = vld1s_s16(&v5[istride * 10]); + int16x4_t v33 = vld1s_s16(&v5[istride * 2]); + int16x4_t v39 = vld1s_s16(&v5[istride * 9]); + int16x4_t v46 = vld1s_s16(&v5[istride * 3]); + int16x4_t v52 = vld1s_s16(&v5[istride * 8]); + int16x4_t v59 = vld1s_s16(&v5[istride * 4]); + int16x4_t v65 = vld1s_s16(&v5[istride * 7]); + int16x4_t v72 = vld1s_s16(&v5[istride * 5]); + int16x4_t v78 = vld1s_s16(&v5[istride * 6]); + float32x2_t v97 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v96)), 15); + float32x2_t v131 = vmul_f32(v229, v129); + float32x2_t v174 = vmul_f32(v229, v172); + float32x2_t v181 = vmul_f32(v229, v179); + float32x2_t v188 = vmul_f32(v229, v186); + float32x2_t v195 = vmul_f32(v229, v193); + float32x2_t v202 = vmul_f32(v229, v200); + float32x2_t v209 = vmul_f32(v229, v207); + float32x2_t v216 = vmul_f32(v229, v214); + float32x2_t v223 = vmul_f32(v229, v221); + float32x2_t v230 = vmul_f32(v229, v228); + float32x2_t v27 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v26)), 15); + float32x2_t v34 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v33)), 15); + float32x2_t v40 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v39)), 15); + float32x2_t v47 = 
vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v46)), 15); + float32x2_t v53 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v52)), 15); + float32x2_t v60 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v59)), 15); + float32x2_t v66 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v65)), 15); + float32x2_t v73 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v72)), 15); + float32x2_t v79 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v78)), 15); + float32x2_t v28 = vadd_f32(v21, v27); + float32x2_t v41 = vadd_f32(v34, v40); + float32x2_t v54 = vadd_f32(v47, v53); + float32x2_t v67 = vadd_f32(v60, v66); + float32x2_t v80 = vadd_f32(v73, v79); + float32x2_t v81 = vsub_f32(v21, v27); + float32x2_t v82 = vsub_f32(v34, v40); + float32x2_t v83 = vsub_f32(v47, v53); + float32x2_t v84 = vsub_f32(v60, v66); + float32x2_t v85 = vsub_f32(v73, v79); + float32x2_t v86 = vadd_f32(v28, v41); + float32x2_t v87 = vadd_f32(v54, v80); + float32x2_t v89 = vsub_f32(v82, v83); + float32x2_t v90 = vadd_f32(v81, v85); + float32x2_t v101 = vsub_f32(v41, v67); + float32x2_t v102 = vsub_f32(v28, v67); + float32x2_t v103 = vsub_f32(v41, v28); + float32x2_t v104 = vsub_f32(v80, v67); + float32x2_t v105 = vsub_f32(v54, v67); + float32x2_t v106 = vsub_f32(v80, v54); + float32x2_t v107 = vsub_f32(v41, v80); + float32x2_t v108 = vsub_f32(v28, v54); + float32x2_t v110 = vadd_f32(v82, v84); + float32x2_t v111 = vsub_f32(v81, v84); + float32x2_t v112 = vadd_f32(v81, v82); + float32x2_t v113 = vsub_f32(v84, v85); + float32x2_t v114 = vsub_f32(v83, v84); + float32x2_t v115 = vsub_f32(v83, v85); + float32x2_t v116 = vadd_f32(v82, v85); + float32x2_t v117 = vsub_f32(v81, v83); + float32x2_t v88 = vadd_f32(v67, v86); + float32x2_t v99 = vsub_f32(v89, v90); + float32x2_t v109 = vsub_f32(v87, v86); + float32x2_t v118 = vadd_f32(v89, v90); + float32x2_t v137 = vmul_f32(v101, v136); + float32x2_t v141 = vmul_f32(v102, v140); + float32x2_t v145 = vmul_f32(v103, v144); + float32x2_t v149 = vmul_f32(v104, v148); + float32x2_t v153 = vmul_f32(v105, v152); + float32x2_t v157 = vmul_f32(v106, v156); + float32x2_t v161 = vmul_f32(v107, v160); + float32x2_t v165 = vmul_f32(v108, v164); + float32x2_t v175 = vrev64_f32(v110); + float32x2_t v182 = vrev64_f32(v111); + float32x2_t v189 = vrev64_f32(v112); + float32x2_t v196 = vrev64_f32(v113); + float32x2_t v203 = vrev64_f32(v114); + float32x2_t v210 = vrev64_f32(v115); + float32x2_t v217 = vrev64_f32(v116); + float32x2_t v224 = vrev64_f32(v117); + float32x2_t v91 = vadd_f32(v88, v87); + float32x2_t v100 = vsub_f32(v99, v84); + float32x2_t v169 = vmul_f32(v109, v168); + float32x2_t v176 = vmul_f32(v175, v174); + float32x2_t v183 = vmul_f32(v182, v181); + float32x2_t v190 = vmul_f32(v189, v188); + float32x2_t v197 = vmul_f32(v196, v195); + float32x2_t v204 = vmul_f32(v203, v202); + float32x2_t v211 = vmul_f32(v210, v209); + float32x2_t v218 = vmul_f32(v217, v216); + float32x2_t v225 = vmul_f32(v224, v223); + float32x2_t v231 = vrev64_f32(v118); + float32x2_t v234 = vadd_f32(v137, v141); + float32x2_t v235 = vadd_f32(v141, v145); + float32x2_t v236 = vsub_f32(v137, v145); + float32x2_t v237 = vadd_f32(v149, v153); + float32x2_t v238 = vadd_f32(v153, v157); + float32x2_t v239 = vsub_f32(v149, v157); + float32x2_t v98 = vadd_f32(v97, v91); + float32x2_t v126 = vmul_f32(v91, v125); + float32x2_t v132 = vrev64_f32(v100); + float32x2_t v232 = vmul_f32(v231, v230); + float32x2_t v240 = vadd_f32(v165, v169); + float32x2_t v241 = vadd_f32(v161, v169); + float32x2_t v242 = vadd_f32(v183, v190); + float32x2_t v243 = vsub_f32(v176, v190); + float32x2_t v244 
= vadd_f32(v204, v211); + float32x2_t v245 = vsub_f32(v197, v211); + float32x2_t v133 = vmul_f32(v132, v131); + float32x2_t v233 = vsub_f32(v98, v126); + float32x2_t v246 = vadd_f32(v225, v232); + float32x2_t v247 = vsub_f32(v218, v232); + float32x2_t v248 = vadd_f32(v238, v240); + float32x2_t v266 = vadd_f32(v242, v243); + int16x4_t v282 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v98, 15), (int32x2_t){0, 0})); + float32x2_t v249 = vadd_f32(v248, v233); + float32x2_t v250 = vsub_f32(v233, v235); + float32x2_t v252 = vadd_f32(v233, v239); + float32x2_t v254 = vsub_f32(v233, v236); + float32x2_t v256 = vadd_f32(v233, v234); + float32x2_t v258 = vadd_f32(v133, v244); + float32x2_t v260 = vsub_f32(v246, v242); + float32x2_t v262 = vadd_f32(v133, v247); + float32x2_t v264 = vsub_f32(v247, v243); + float32x2_t v267 = vadd_f32(v266, v244); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v282), 0); + float32x2_t v251 = vsub_f32(v250, v240); + float32x2_t v253 = vadd_f32(v252, v241); + float32x2_t v255 = vsub_f32(v254, v241); + float32x2_t v257 = vsub_f32(v256, v237); + float32x2_t v259 = vadd_f32(v258, v246); + float32x2_t v261 = vsub_f32(v260, v133); + float32x2_t v263 = vadd_f32(v262, v245); + float32x2_t v265 = vsub_f32(v264, v133); + float32x2_t v268 = vadd_f32(v267, v245); + float32x2_t v269 = vsub_f32(v268, v133); + float32x2_t v271 = vadd_f32(v249, v259); + float32x2_t v272 = vadd_f32(v251, v261); + float32x2_t v273 = vsub_f32(v253, v263); + float32x2_t v274 = vadd_f32(v255, v265); + float32x2_t v275 = vsub_f32(v255, v265); + float32x2_t v276 = vadd_f32(v253, v263); + float32x2_t v277 = vsub_f32(v251, v261); + float32x2_t v278 = vsub_f32(v249, v259); + float32x2_t v270 = vadd_f32(v257, v269); + float32x2_t v279 = vsub_f32(v257, v269); + int16x4_t v294 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v271, 15), (int32x2_t){0, 0})); + int16x4_t v300 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v272, 15), (int32x2_t){0, 0})); + int16x4_t v306 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v273, 15), (int32x2_t){0, 0})); + int16x4_t v312 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v274, 15), (int32x2_t){0, 0})); + int16x4_t v318 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v275, 15), (int32x2_t){0, 0})); + int16x4_t v324 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v276, 15), (int32x2_t){0, 0})); + int16x4_t v330 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v277, 15), (int32x2_t){0, 0})); + int16x4_t v336 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v278, 15), (int32x2_t){0, 0})); + int16x4_t v288 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v270, 15), (int32x2_t){0, 0})); + v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v294), 0); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v300), 0); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v306), 0); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v312), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v318), 0); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v324), 0); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v330), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v336), 0); + int16x4_t v342 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v279, 15), (int32x2_t){0, 0})); + v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v288), 0); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v342), 0); + v5 += 1 * 1; + v6 += 1 * 1; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uu11(const armral_cmplx_int16_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, 
int howmany, + float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * 1; + int64_t v12 = v10 * 1; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + float v149 = 1.1000000000000001e+00F; + float v154 = -3.3166247903554003e-01F; + float v161 = 5.1541501300188641e-01F; + float v166 = 9.4125353283118118e-01F; + float v171 = 1.4143537075597825e+00F; + float v176 = 8.5949297361449750e-01F; + float v181 = 4.2314838273285138e-02F; + float v186 = 3.8639279888589606e-01F; + float v191 = 5.1254589567200015e-01F; + float v196 = 1.0702757469471715e+00F; + float v201 = 5.5486073394528512e-01F; + float v206 = -1.2412944743900585e+00F; + float v213 = -2.0897833842005756e-01F; + float v220 = -3.7415717312460811e-01F; + float v227 = -4.9929922194110327e-02F; + float v234 = -6.5815896284539266e-01F; + float v241 = -6.3306543373877577e-01F; + float v248 = -1.0822460581641109e+00F; + float v255 = -8.1720737907134022e-01F; + float v262 = -4.2408709531871824e-01F; + const int32_t *v409 = &v5[v0]; + int32_t *v621 = &v6[v2]; + int64_t v27 = v0 * 10; + int64_t v36 = v0 * 2; + int64_t v44 = v0 * 9; + int64_t v53 = v0 * 3; + int64_t v61 = v0 * 8; + int64_t v70 = v0 * 4; + int64_t v78 = v0 * 7; + int64_t v87 = v0 * 5; + int64_t v95 = v0 * 6; + float v157 = v4 * v154; + float v209 = v4 * v206; + float v216 = v4 * v213; + float v223 = v4 * v220; + float v230 = v4 * v227; + float v237 = v4 * v234; + float v244 = v4 * v241; + float v251 = v4 * v248; + float v258 = v4 * v255; + float v265 = v4 * v262; + int64_t v324 = v2 * 10; + int64_t v332 = v2 * 9; + int64_t v340 = v2 * 8; + int64_t v348 = v2 * 7; + int64_t v356 = v2 * 6; + int64_t v364 = v2 * 5; + int64_t v372 = v2 * 4; + int64_t v380 = v2 * 3; + int64_t v388 = v2 * 2; + const int32_t *v500 = &v5[0]; + svfloat32_t v504 = svdup_n_f32(v149); + svfloat32_t v506 = svdup_n_f32(v161); + svfloat32_t v507 = svdup_n_f32(v166); + svfloat32_t v508 = svdup_n_f32(v171); + svfloat32_t v509 = svdup_n_f32(v176); + svfloat32_t v510 = svdup_n_f32(v181); + svfloat32_t v511 = svdup_n_f32(v186); + svfloat32_t v512 = svdup_n_f32(v191); + svfloat32_t v513 = svdup_n_f32(v196); + svfloat32_t v514 = svdup_n_f32(v201); + int32_t *v531 = &v6[0]; + svfloat32_t v25 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v409[0])), + 1.F / (1ULL << 15ULL)); + const int32_t *v418 = &v5[v27]; + const int32_t *v427 = &v5[v36]; + const int32_t *v436 = &v5[v44]; + const int32_t *v445 = &v5[v53]; + const int32_t *v454 = &v5[v61]; + const int32_t *v463 = &v5[v70]; + const int32_t *v472 = &v5[v78]; + const int32_t *v481 = &v5[v87]; + const int32_t *v490 = &v5[v95]; + svfloat32_t v505 = svdup_n_f32(v157); + svfloat32_t v515 = svdup_n_f32(v209); + svfloat32_t v516 = svdup_n_f32(v216); + svfloat32_t v517 = svdup_n_f32(v223); + svfloat32_t v518 = svdup_n_f32(v230); + svfloat32_t v519 = svdup_n_f32(v237); + svfloat32_t v520 = svdup_n_f32(v244); + svfloat32_t v521 = svdup_n_f32(v251); + svfloat32_t v522 = svdup_n_f32(v258); + svfloat32_t v523 = svdup_n_f32(v265); + int32_t *v540 = &v6[v324]; + int32_t *v549 = &v6[v332]; + int32_t *v558 = &v6[v340]; + int32_t *v567 = &v6[v348]; + int32_t *v576 = &v6[v356]; + int32_t *v585 = &v6[v364]; + int32_t *v594 = &v6[v372]; + int32_t *v603 = &v6[v380]; + int32_t *v612 = &v6[v388]; + svfloat32_t v121 = svmul_n_f32_x( + 
pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v500[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v33 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v418[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v42 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v427[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v50 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v436[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v59 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v445[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v67 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v454[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v76 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v463[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v84 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v472[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v93 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v481[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v101 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v490[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v34 = svadd_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v51 = svadd_f32_x(svptrue_b32(), v42, v50); + svfloat32_t v68 = svadd_f32_x(svptrue_b32(), v59, v67); + svfloat32_t v85 = svadd_f32_x(svptrue_b32(), v76, v84); + svfloat32_t v102 = svadd_f32_x(svptrue_b32(), v93, v101); + svfloat32_t v103 = svsub_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v104 = svsub_f32_x(svptrue_b32(), v42, v50); + svfloat32_t v105 = svsub_f32_x(svptrue_b32(), v59, v67); + svfloat32_t v106 = svsub_f32_x(svptrue_b32(), v76, v84); + svfloat32_t v107 = svsub_f32_x(svptrue_b32(), v93, v101); + svfloat32_t v108 = svadd_f32_x(svptrue_b32(), v34, v51); + svfloat32_t v109 = svadd_f32_x(svptrue_b32(), v68, v102); + svfloat32_t v111 = svsub_f32_x(svptrue_b32(), v104, v105); + svfloat32_t v112 = svadd_f32_x(svptrue_b32(), v103, v107); + svfloat32_t v125 = svsub_f32_x(svptrue_b32(), v51, v85); + svfloat32_t v126 = svsub_f32_x(svptrue_b32(), v34, v85); + svfloat32_t v127 = svsub_f32_x(svptrue_b32(), v51, v34); + svfloat32_t v128 = svsub_f32_x(svptrue_b32(), v102, v85); + svfloat32_t v129 = svsub_f32_x(svptrue_b32(), v68, v85); + svfloat32_t v130 = svsub_f32_x(svptrue_b32(), v102, v68); + svfloat32_t v131 = svsub_f32_x(svptrue_b32(), v51, v102); + svfloat32_t v132 = svsub_f32_x(svptrue_b32(), v34, v68); + svfloat32_t v134 = svadd_f32_x(svptrue_b32(), v104, v106); + svfloat32_t v135 = svsub_f32_x(svptrue_b32(), v103, v106); + svfloat32_t v136 = svadd_f32_x(svptrue_b32(), v103, v104); + svfloat32_t v137 = svsub_f32_x(svptrue_b32(), v106, v107); + svfloat32_t v138 = svsub_f32_x(svptrue_b32(), v105, v106); + svfloat32_t v139 = svsub_f32_x(svptrue_b32(), v105, v107); + svfloat32_t v140 = svadd_f32_x(svptrue_b32(), v104, v107); + svfloat32_t v141 = svsub_f32_x(svptrue_b32(), v103, v105); + svfloat32_t v110 = svadd_f32_x(svptrue_b32(), v85, v108); + svfloat32_t v123 = svsub_f32_x(svptrue_b32(), v111, v112); + svfloat32_t v133 = svsub_f32_x(svptrue_b32(), v109, v108); + svfloat32_t v142 = 
svadd_f32_x(svptrue_b32(), v111, v112); + svfloat32_t v169 = svmul_f32_x(svptrue_b32(), v126, v507); + svfloat32_t v174 = svmul_f32_x(svptrue_b32(), v127, v508); + svfloat32_t v184 = svmul_f32_x(svptrue_b32(), v129, v510); + svfloat32_t v189 = svmul_f32_x(svptrue_b32(), v130, v511); + svfloat32_t zero211 = svdup_n_f32(0); + svfloat32_t v211 = svcmla_f32_x(pred_full, zero211, v515, v134, 90); + svfloat32_t zero225 = svdup_n_f32(0); + svfloat32_t v225 = svcmla_f32_x(pred_full, zero225, v517, v136, 90); + svfloat32_t zero232 = svdup_n_f32(0); + svfloat32_t v232 = svcmla_f32_x(pred_full, zero232, v518, v137, 90); + svfloat32_t zero246 = svdup_n_f32(0); + svfloat32_t v246 = svcmla_f32_x(pred_full, zero246, v520, v139, 90); + svfloat32_t zero253 = svdup_n_f32(0); + svfloat32_t v253 = svcmla_f32_x(pred_full, zero253, v521, v140, 90); + svfloat32_t v113 = svadd_f32_x(svptrue_b32(), v110, v109); + svfloat32_t v124 = svsub_f32_x(svptrue_b32(), v123, v106); + svfloat32_t v204 = svmul_f32_x(svptrue_b32(), v133, v514); + svfloat32_t zero267 = svdup_n_f32(0); + svfloat32_t v267 = svcmla_f32_x(pred_full, zero267, v523, v142, 90); + svfloat32_t v269 = svmla_f32_x(pred_full, v169, v125, v506); + svfloat32_t v270 = svmla_f32_x(pred_full, v174, v126, v507); + svfloat32_t v271 = svnmls_f32_x(pred_full, v174, v125, v506); + svfloat32_t v272 = svmla_f32_x(pred_full, v184, v128, v509); + svfloat32_t v273 = svmla_f32_x(pred_full, v189, v129, v510); + svfloat32_t v274 = svnmls_f32_x(pred_full, v189, v128, v509); + svfloat32_t v277 = svcmla_f32_x(pred_full, v225, v516, v135, 90); + svfloat32_t v278 = svsub_f32_x(svptrue_b32(), v211, v225); + svfloat32_t v279 = svcmla_f32_x(pred_full, v246, v519, v138, 90); + svfloat32_t v280 = svsub_f32_x(svptrue_b32(), v232, v246); + svfloat32_t v122 = svadd_f32_x(svptrue_b32(), v121, v113); + svfloat32_t zero159 = svdup_n_f32(0); + svfloat32_t v159 = svcmla_f32_x(pred_full, zero159, v505, v124, 90); + svfloat32_t v275 = svmla_f32_x(pred_full, v204, v132, v513); + svfloat32_t v276 = svmla_f32_x(pred_full, v204, v131, v512); + svfloat32_t v281 = svcmla_f32_x(pred_full, v267, v522, v141, 90); + svfloat32_t v282 = svsub_f32_x(svptrue_b32(), v253, v267); + svfloat32_t v301 = svadd_f32_x(svptrue_b32(), v277, v278); + svfloat32_t v268 = svmls_f32_x(pred_full, v122, v113, v504); + svfloat32_t v283 = svadd_f32_x(svptrue_b32(), v273, v275); + svfloat32_t v293 = svadd_f32_x(svptrue_b32(), v159, v279); + svfloat32_t v295 = svsub_f32_x(svptrue_b32(), v281, v277); + svfloat32_t v297 = svadd_f32_x(svptrue_b32(), v159, v282); + svfloat32_t v299 = svsub_f32_x(svptrue_b32(), v282, v278); + svfloat32_t v302 = svadd_f32_x(svptrue_b32(), v301, v279); + svint16_t v317 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v122, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v284 = svadd_f32_x(svptrue_b32(), v283, v268); + svfloat32_t v285 = svsub_f32_x(svptrue_b32(), v268, v270); + svfloat32_t v287 = svadd_f32_x(svptrue_b32(), v268, v274); + svfloat32_t v289 = svsub_f32_x(svptrue_b32(), v268, v271); + svfloat32_t v291 = svadd_f32_x(svptrue_b32(), v268, v269); + svfloat32_t v294 = svadd_f32_x(svptrue_b32(), v293, v281); + svfloat32_t v296 = svsub_f32_x(svptrue_b32(), v295, v159); + svfloat32_t v298 = svadd_f32_x(svptrue_b32(), v297, v280); + svfloat32_t v300 = svsub_f32_x(svptrue_b32(), v299, v159); + svfloat32_t v303 = svadd_f32_x(svptrue_b32(), v302, v280); + svst1w_u64(pred_full, 
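+        /* Store path: results are scaled by 2^31, converted to int32, and
+           svtbl keeps the top 16 bits of each lane (a Q15 rescale); svst1w_u64
+           then writes the packed re/im pair of every active lane. */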
(unsigned *)(v531), svreinterpret_u64_s16(v317)); + svfloat32_t v286 = svsub_f32_x(svptrue_b32(), v285, v275); + svfloat32_t v288 = svadd_f32_x(svptrue_b32(), v287, v276); + svfloat32_t v290 = svsub_f32_x(svptrue_b32(), v289, v276); + svfloat32_t v292 = svsub_f32_x(svptrue_b32(), v291, v272); + svfloat32_t v304 = svsub_f32_x(svptrue_b32(), v303, v159); + svfloat32_t v306 = svadd_f32_x(svptrue_b32(), v284, v294); + svfloat32_t v313 = svsub_f32_x(svptrue_b32(), v284, v294); + svfloat32_t v305 = svadd_f32_x(svptrue_b32(), v292, v304); + svfloat32_t v307 = svadd_f32_x(svptrue_b32(), v286, v296); + svfloat32_t v308 = svsub_f32_x(svptrue_b32(), v288, v298); + svfloat32_t v309 = svadd_f32_x(svptrue_b32(), v290, v300); + svfloat32_t v310 = svsub_f32_x(svptrue_b32(), v290, v300); + svfloat32_t v311 = svadd_f32_x(svptrue_b32(), v288, v298); + svfloat32_t v312 = svsub_f32_x(svptrue_b32(), v286, v296); + svfloat32_t v314 = svsub_f32_x(svptrue_b32(), v292, v304); + svint16_t v333 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v306, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v389 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v313, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v325 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v305, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v341 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v307, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v349 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v308, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v357 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v309, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v365 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v310, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v373 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v311, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v381 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v312, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v397 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v314, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v549), svreinterpret_u64_s16(v333)); + svst1w_u64(pred_full, (unsigned *)(v612), svreinterpret_u64_s16(v389)); + svst1w_u64(pred_full, (unsigned *)(v540), svreinterpret_u64_s16(v325)); + svst1w_u64(pred_full, (unsigned *)(v558), 
svreinterpret_u64_s16(v341)); + svst1w_u64(pred_full, (unsigned *)(v567), svreinterpret_u64_s16(v349)); + svst1w_u64(pred_full, (unsigned *)(v576), svreinterpret_u64_s16(v357)); + svst1w_u64(pred_full, (unsigned *)(v585), svreinterpret_u64_s16(v365)); + svst1w_u64(pred_full, (unsigned *)(v594), svreinterpret_u64_s16(v373)); + svst1w_u64(pred_full, (unsigned *)(v603), svreinterpret_u64_s16(v381)); + svst1w_u64(pred_full, (unsigned *)(v621), svreinterpret_u64_s16(v397)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uu12(const armral_cmplx_int16_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + for (int j = 0; j < howmany; j += 1) { + int16x4_t v83 = vld1s_s16(&v5[istride]); + float v118 = 1.0000000000000000e+00F; + float v119 = -1.0000000000000000e+00F; + float v145 = -1.4999999999999998e+00F; + float v146 = 1.4999999999999998e+00F; + float v174 = 8.6602540378443871e-01F; + float32x2_t v177 = (float32x2_t){v4, v4}; + float v182 = -8.6602540378443871e-01F; + int16x4_t v34 = vld1s_s16(&v5[0]); + float32x2_t v84 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v83)), 15); + float32x2_t v120 = (float32x2_t){v118, v119}; + float32x2_t v143 = (float32x2_t){v145, v145}; + float32x2_t v147 = (float32x2_t){v145, v146}; + float32x2_t v176 = (float32x2_t){v174, v182}; + float32x2_t v183 = (float32x2_t){v182, v182}; + int16x4_t v20 = vld1s_s16(&v5[istride * 4]); + int16x4_t v26 = vld1s_s16(&v5[istride * 8]); + float32x2_t v35 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v34)), 15); + int16x4_t v41 = vld1s_s16(&v5[istride * 7]); + int16x4_t v47 = vld1s_s16(&v5[istride * 11]); + int16x4_t v55 = vld1s_s16(&v5[istride * 3]); + int16x4_t v62 = vld1s_s16(&v5[istride * 10]); + int16x4_t v68 = vld1s_s16(&v5[istride * 2]); + int16x4_t v76 = vld1s_s16(&v5[istride * 6]); + int16x4_t v89 = vld1s_s16(&v5[istride * 5]); + int16x4_t v97 = vld1s_s16(&v5[istride * 9]); + float32x2_t v122 = vmul_f32(v177, v120); + float32x2_t v149 = vmul_f32(v177, v147); + float32x2_t v178 = vmul_f32(v177, v176); + float32x2_t v21 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v20)), 15); + float32x2_t v27 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v26)), 15); + float32x2_t v42 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v41)), 15); + float32x2_t v48 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v47)), 15); + float32x2_t v56 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v55)), 15); + float32x2_t v63 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v62)), 15); + float32x2_t v69 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v68)), 15); + float32x2_t v77 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v76)), 15); + float32x2_t v90 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v89)), 15); + float32x2_t v98 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v97)), 15); + float32x2_t v28 = vadd_f32(v21, v27); + float32x2_t v29 = vsub_f32(v21, v27); + float32x2_t v49 = vadd_f32(v42, v48); + float32x2_t v50 = vsub_f32(v42, v48); + float32x2_t v70 = vadd_f32(v63, v69); + float32x2_t v71 = vsub_f32(v63, v69); + float32x2_t v91 = vadd_f32(v84, v90); + float32x2_t v92 = vsub_f32(v84, v90); + float32x2_t v36 = vadd_f32(v28, v35); + float32x2_t v57 = vadd_f32(v49, v56); + float32x2_t v78 = vadd_f32(v70, v77); + float32x2_t v99 = vadd_f32(v91, v98); + float32x2_t v127 = vadd_f32(v28, v70); + float32x2_t v128 = vsub_f32(v28, v70); + float32x2_t v129 = vadd_f32(v49, v91); + float32x2_t v130 = vsub_f32(v49, v91); + float32x2_t 
v154 = vadd_f32(v29, v71); + float32x2_t v155 = vsub_f32(v29, v71); + float32x2_t v156 = vadd_f32(v50, v92); + float32x2_t v157 = vsub_f32(v50, v92); + float32x2_t v100 = vadd_f32(v36, v78); + float32x2_t v101 = vsub_f32(v36, v78); + float32x2_t v102 = vadd_f32(v57, v99); + float32x2_t v103 = vsub_f32(v57, v99); + float32x2_t v131 = vadd_f32(v127, v129); + float32x2_t v132 = vsub_f32(v127, v129); + float32x2_t v144 = vmul_f32(v128, v143); + float32x2_t v150 = vrev64_f32(v130); + float32x2_t v158 = vadd_f32(v154, v156); + float32x2_t v159 = vsub_f32(v154, v156); + float32x2_t v179 = vrev64_f32(v155); + float32x2_t v184 = vmul_f32(v157, v183); + float32x2_t v104 = vadd_f32(v100, v102); + float32x2_t v105 = vsub_f32(v100, v102); + float32x2_t v123 = vrev64_f32(v103); + float32x2_t v136 = vmul_f32(v131, v143); + float32x2_t v140 = vmul_f32(v132, v143); + float32x2_t v151 = vmul_f32(v150, v149); + float32x2_t v165 = vrev64_f32(v158); + float32x2_t v172 = vrev64_f32(v159); + float32x2_t v180 = vmul_f32(v179, v178); + float32x2_t v124 = vmul_f32(v123, v122); + float32x2_t v152 = vadd_f32(v144, v151); + float32x2_t v153 = vsub_f32(v144, v151); + float32x2_t v166 = vmul_f32(v165, v178); + float32x2_t v173 = vmul_f32(v172, v178); + float32x2_t v185 = vadd_f32(v180, v184); + float32x2_t v186 = vsub_f32(v180, v184); + float32x2_t v187 = vadd_f32(v104, v136); + int16x4_t v192 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v104, 15), (int32x2_t){0, 0})); + float32x2_t v229 = vadd_f32(v105, v140); + int16x4_t v234 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v105, 15), (int32x2_t){0, 0})); + float32x2_t v125 = vadd_f32(v101, v124); + float32x2_t v126 = vsub_f32(v101, v124); + float32x2_t v188 = vadd_f32(v187, v166); + float32x2_t v189 = vsub_f32(v187, v166); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v192), 0); + float32x2_t v230 = vadd_f32(v229, v173); + float32x2_t v231 = vsub_f32(v229, v173); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v234), 0); + int16x4_t v198 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v189, 15), (int32x2_t){0, 0})); + int16x4_t v204 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v188, 15), (int32x2_t){0, 0})); + float32x2_t v208 = vadd_f32(v126, v153); + int16x4_t v213 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v126, 15), (int32x2_t){0, 0})); + int16x4_t v240 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v231, 15), (int32x2_t){0, 0})); + int16x4_t v246 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v230, 15), (int32x2_t){0, 0})); + float32x2_t v250 = vadd_f32(v125, v152); + int16x4_t v255 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v125, 15), (int32x2_t){0, 0})); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v198), 0); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v204), 0); + float32x2_t v209 = vadd_f32(v208, v186); + float32x2_t v210 = vsub_f32(v208, v186); + v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v213), 0); + v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v240), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v246), 0); + float32x2_t v251 = vadd_f32(v250, v185); + float32x2_t v252 = vsub_f32(v250, v185); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v255), 0); + int16x4_t v219 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v210, 15), (int32x2_t){0, 0})); + int16x4_t v225 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v209, 15), (int32x2_t){0, 0})); + int16x4_t v261 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v252, 15), (int32x2_t){0, 0})); + int16x4_t v267 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v251, 15), (int32x2_t){0, 0})); + 
v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v219), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v225), 0); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v261), 0); + v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v267), 0); + v5 += 1 * 1; + v6 += 1 * 1; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uu12(const armral_cmplx_int16_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * 1; + int64_t v12 = v10 * 1; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + float v148 = -1.0000000000000000e+00F; + float v173 = -1.4999999999999998e+00F; + float v178 = 1.4999999999999998e+00F; + float v214 = -8.6602540378443871e-01F; + const int32_t *v416 = &v5[v0]; + int32_t *v492 = &v6[v2]; + int64_t v19 = v0 * 4; + int64_t v27 = v0 * 8; + int64_t v46 = v0 * 7; + int64_t v54 = v0 * 11; + int64_t v64 = v0 * 3; + int64_t v73 = v0 * 10; + int64_t v81 = v0 * 2; + int64_t v91 = v0 * 6; + int64_t v108 = v0 * 5; + int64_t v118 = v0 * 9; + float v151 = v4 * v148; + float v181 = v4 * v178; + float v210 = v4 * v214; + int64_t v232 = v2 * 4; + int64_t v240 = v2 * 8; + int64_t v251 = v2 * 9; + int64_t v267 = v2 * 5; + int64_t v278 = v2 * 6; + int64_t v286 = v2 * 10; + int64_t v294 = v2 * 2; + int64_t v305 = v2 * 3; + int64_t v313 = v2 * 7; + int64_t v321 = v2 * 11; + const int32_t *v353 = &v5[0]; + svfloat32_t v443 = svdup_n_f32(v173); + svfloat32_t v448 = svdup_n_f32(v214); + int32_t *v456 = &v6[0]; + svfloat32_t v106 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v416[0])), + 1.F / (1ULL << 15ULL)); + const int32_t *v334 = &v5[v19]; + const int32_t *v343 = &v5[v27]; + const int32_t *v362 = &v5[v46]; + const int32_t *v371 = &v5[v54]; + const int32_t *v380 = &v5[v64]; + const int32_t *v389 = &v5[v73]; + const int32_t *v398 = &v5[v81]; + const int32_t *v407 = &v5[v91]; + const int32_t *v425 = &v5[v108]; + const int32_t *v434 = &v5[v118]; + svfloat32_t v440 = svdup_n_f32(v151); + svfloat32_t v444 = svdup_n_f32(v181); + svfloat32_t v447 = svdup_n_f32(v210); + int32_t *v465 = &v6[v232]; + int32_t *v474 = &v6[v240]; + int32_t *v483 = &v6[v251]; + int32_t *v501 = &v6[v267]; + int32_t *v510 = &v6[v278]; + int32_t *v519 = &v6[v286]; + int32_t *v528 = &v6[v294]; + int32_t *v537 = &v6[v305]; + int32_t *v546 = &v6[v313]; + int32_t *v555 = &v6[v321]; + svfloat32_t v43 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v353[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v25 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v334[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v33 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v343[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v52 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v362[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v60 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v371[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v70 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + 
svld1sh_s32(pred_full, (const int16_t *)&v380[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v79 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v389[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v87 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v398[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v97 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v407[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v114 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v425[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v124 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v434[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v34 = svadd_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v35 = svsub_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v61 = svadd_f32_x(svptrue_b32(), v52, v60); + svfloat32_t v62 = svsub_f32_x(svptrue_b32(), v52, v60); + svfloat32_t v88 = svadd_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v89 = svsub_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v115 = svadd_f32_x(svptrue_b32(), v106, v114); + svfloat32_t v116 = svsub_f32_x(svptrue_b32(), v106, v114); + svfloat32_t v44 = svadd_f32_x(svptrue_b32(), v34, v43); + svfloat32_t v71 = svadd_f32_x(svptrue_b32(), v61, v70); + svfloat32_t v98 = svadd_f32_x(svptrue_b32(), v88, v97); + svfloat32_t v125 = svadd_f32_x(svptrue_b32(), v115, v124); + svfloat32_t v156 = svadd_f32_x(svptrue_b32(), v34, v88); + svfloat32_t v157 = svsub_f32_x(svptrue_b32(), v34, v88); + svfloat32_t v158 = svadd_f32_x(svptrue_b32(), v61, v115); + svfloat32_t v159 = svsub_f32_x(svptrue_b32(), v61, v115); + svfloat32_t v186 = svadd_f32_x(svptrue_b32(), v35, v89); + svfloat32_t v187 = svsub_f32_x(svptrue_b32(), v35, v89); + svfloat32_t v188 = svadd_f32_x(svptrue_b32(), v62, v116); + svfloat32_t v189 = svsub_f32_x(svptrue_b32(), v62, v116); + svfloat32_t v126 = svadd_f32_x(svptrue_b32(), v44, v98); + svfloat32_t v127 = svsub_f32_x(svptrue_b32(), v44, v98); + svfloat32_t v128 = svadd_f32_x(svptrue_b32(), v71, v125); + svfloat32_t v129 = svsub_f32_x(svptrue_b32(), v71, v125); + svfloat32_t v160 = svadd_f32_x(svptrue_b32(), v156, v158); + svfloat32_t v161 = svsub_f32_x(svptrue_b32(), v156, v158); + svfloat32_t zero183 = svdup_n_f32(0); + svfloat32_t v183 = svcmla_f32_x(pred_full, zero183, v444, v159, 90); + svfloat32_t v190 = svadd_f32_x(svptrue_b32(), v186, v188); + svfloat32_t v191 = svsub_f32_x(svptrue_b32(), v186, v188); + svfloat32_t zero212 = svdup_n_f32(0); + svfloat32_t v212 = svcmla_f32_x(pred_full, zero212, v447, v187, 90); + svfloat32_t v130 = svadd_f32_x(svptrue_b32(), v126, v128); + svfloat32_t v131 = svsub_f32_x(svptrue_b32(), v126, v128); + svfloat32_t zero153 = svdup_n_f32(0); + svfloat32_t v153 = svcmla_f32_x(pred_full, zero153, v440, v129, 90); + svfloat32_t v184 = svmla_f32_x(pred_full, v183, v157, v443); + svfloat32_t v185 = svnmls_f32_x(pred_full, v183, v157, v443); + svfloat32_t zero198 = svdup_n_f32(0); + svfloat32_t v198 = svcmla_f32_x(pred_full, zero198, v447, v190, 90); + svfloat32_t zero205 = svdup_n_f32(0); + svfloat32_t v205 = svcmla_f32_x(pred_full, zero205, v447, v191, 90); + svfloat32_t v218 = svmla_f32_x(pred_full, v212, v189, v448); + svfloat32_t v219 = svmls_f32_x(pred_full, v212, v189, v448); + svfloat32_t v154 = svadd_f32_x(svptrue_b32(), v127, v153); + svfloat32_t v155 = 
svsub_f32_x(svptrue_b32(), v127, v153); + svfloat32_t v220 = svmla_f32_x(pred_full, v130, v160, v443); + svint16_t v225 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v130, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v274 = svmla_f32_x(pred_full, v131, v161, v443); + svint16_t v279 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v131, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v221 = svadd_f32_x(svptrue_b32(), v220, v198); + svfloat32_t v222 = svsub_f32_x(svptrue_b32(), v220, v198); + svfloat32_t v247 = svadd_f32_x(svptrue_b32(), v155, v185); + svint16_t v252 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v155, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v275 = svadd_f32_x(svptrue_b32(), v274, v205); + svfloat32_t v276 = svsub_f32_x(svptrue_b32(), v274, v205); + svfloat32_t v301 = svadd_f32_x(svptrue_b32(), v154, v184); + svint16_t v306 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v154, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v456), svreinterpret_u64_s16(v225)); + svst1w_u64(pred_full, (unsigned *)(v510), svreinterpret_u64_s16(v279)); + svint16_t v233 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v222, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v241 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v221, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v248 = svadd_f32_x(svptrue_b32(), v247, v219); + svfloat32_t v249 = svsub_f32_x(svptrue_b32(), v247, v219); + svint16_t v287 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v276, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v295 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v275, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v302 = svadd_f32_x(svptrue_b32(), v301, v218); + svfloat32_t v303 = svsub_f32_x(svptrue_b32(), v301, v218); + svst1w_u64(pred_full, (unsigned *)(v483), svreinterpret_u64_s16(v252)); + svst1w_u64(pred_full, (unsigned *)(v537), svreinterpret_u64_s16(v306)); + svint16_t v260 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v249, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v268 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v248, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v314 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v303, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + 
svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v322 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v302, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v465), svreinterpret_u64_s16(v233)); + svst1w_u64(pred_full, (unsigned *)(v474), svreinterpret_u64_s16(v241)); + svst1w_u64(pred_full, (unsigned *)(v519), svreinterpret_u64_s16(v287)); + svst1w_u64(pred_full, (unsigned *)(v528), svreinterpret_u64_s16(v295)); + svst1w_u64(pred_full, (unsigned *)(v492), svreinterpret_u64_s16(v260)); + svst1w_u64(pred_full, (unsigned *)(v501), svreinterpret_u64_s16(v268)); + svst1w_u64(pred_full, (unsigned *)(v546), svreinterpret_u64_s16(v314)); + svst1w_u64(pred_full, (unsigned *)(v555), svreinterpret_u64_s16(v322)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uu13(const armral_cmplx_int16_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + for (int j = 0; j < howmany; j += 1) { + int16x4_t v20 = vld1s_s16(&v5[istride]); + float v142 = 1.0833333333333333e+00F; + float v146 = -3.0046260628866578e-01F; + float v149 = 7.4927933062613905e-01F; + float v150 = -7.4927933062613905e-01F; + float v156 = 4.0100212832186721e-01F; + float v157 = -4.0100212832186721e-01F; + float v163 = 5.7514072947400308e-01F; + float v164 = -5.7514072947400308e-01F; + float v171 = 5.2422663952658211e-01F; + float v175 = 5.1652078062348972e-01F; + float v179 = 7.7058589030924258e-03F; + float v183 = 4.2763404682656941e-01F; + float v187 = 1.5180597207438440e-01F; + float v191 = 5.7944001890096386e-01F; + float v194 = 1.1543953381323635e+00F; + float v195 = -1.1543953381323635e+00F; + float v201 = 9.0655220171271012e-01F; + float v202 = -9.0655220171271012e-01F; + float v208 = 8.1857027294591811e-01F; + float v209 = -8.1857027294591811e-01F; + float v215 = 1.1971367726043427e+00F; + float v216 = -1.1971367726043427e+00F; + float v222 = 8.6131170741789742e-01F; + float v223 = -8.6131170741789742e-01F; + float v229 = 1.1091548438375507e+00F; + float v230 = -1.1091548438375507e+00F; + float v236 = 4.2741434471979367e-02F; + float v237 = -4.2741434471979367e-02F; + float v243 = -4.5240494294812715e-02F; + float v244 = 4.5240494294812715e-02F; + float v250 = 2.9058457089163264e-01F; + float v251 = -2.9058457089163264e-01F; + float32x2_t v253 = (float32x2_t){v4, v4}; + float32x2_t v21 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v20)), 15); + int16x4_t v127 = vld1s_s16(&v5[0]); + float32x2_t v143 = (float32x2_t){v142, v142}; + float32x2_t v147 = (float32x2_t){v146, v146}; + float32x2_t v151 = (float32x2_t){v149, v150}; + float32x2_t v158 = (float32x2_t){v156, v157}; + float32x2_t v165 = (float32x2_t){v163, v164}; + float32x2_t v172 = (float32x2_t){v171, v171}; + float32x2_t v176 = (float32x2_t){v175, v175}; + float32x2_t v180 = (float32x2_t){v179, v179}; + float32x2_t v184 = (float32x2_t){v183, v183}; + float32x2_t v188 = (float32x2_t){v187, v187}; + float32x2_t v192 = (float32x2_t){v191, v191}; + float32x2_t v196 = (float32x2_t){v194, v195}; + float32x2_t v203 = (float32x2_t){v201, v202}; + float32x2_t v210 = (float32x2_t){v208, v209}; + float32x2_t v217 = (float32x2_t){v215, v216}; + float32x2_t v224 = (float32x2_t){v222, v223}; + float32x2_t v231 = (float32x2_t){v229, 
v230}; + float32x2_t v238 = (float32x2_t){v236, v237}; + float32x2_t v245 = (float32x2_t){v243, v244}; + float32x2_t v252 = (float32x2_t){v250, v251}; + int16x4_t v26 = vld1s_s16(&v5[istride * 12]); + int16x4_t v33 = vld1s_s16(&v5[istride * 2]); + int16x4_t v39 = vld1s_s16(&v5[istride * 11]); + int16x4_t v46 = vld1s_s16(&v5[istride * 3]); + int16x4_t v52 = vld1s_s16(&v5[istride * 10]); + int16x4_t v59 = vld1s_s16(&v5[istride * 4]); + int16x4_t v65 = vld1s_s16(&v5[istride * 9]); + int16x4_t v72 = vld1s_s16(&v5[istride * 5]); + int16x4_t v78 = vld1s_s16(&v5[istride * 8]); + int16x4_t v85 = vld1s_s16(&v5[istride * 6]); + int16x4_t v91 = vld1s_s16(&v5[istride * 7]); + float32x2_t v128 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v127)), 15); + float32x2_t v153 = vmul_f32(v253, v151); + float32x2_t v160 = vmul_f32(v253, v158); + float32x2_t v167 = vmul_f32(v253, v165); + float32x2_t v198 = vmul_f32(v253, v196); + float32x2_t v205 = vmul_f32(v253, v203); + float32x2_t v212 = vmul_f32(v253, v210); + float32x2_t v219 = vmul_f32(v253, v217); + float32x2_t v226 = vmul_f32(v253, v224); + float32x2_t v233 = vmul_f32(v253, v231); + float32x2_t v240 = vmul_f32(v253, v238); + float32x2_t v247 = vmul_f32(v253, v245); + float32x2_t v254 = vmul_f32(v253, v252); + float32x2_t v27 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v26)), 15); + float32x2_t v34 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v33)), 15); + float32x2_t v40 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v39)), 15); + float32x2_t v47 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v46)), 15); + float32x2_t v53 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v52)), 15); + float32x2_t v60 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v59)), 15); + float32x2_t v66 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v65)), 15); + float32x2_t v73 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v72)), 15); + float32x2_t v79 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v78)), 15); + float32x2_t v86 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v85)), 15); + float32x2_t v92 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v91)), 15); + float32x2_t v28 = vadd_f32(v21, v27); + float32x2_t v41 = vadd_f32(v34, v40); + float32x2_t v54 = vadd_f32(v47, v53); + float32x2_t v67 = vadd_f32(v60, v66); + float32x2_t v80 = vadd_f32(v73, v79); + float32x2_t v93 = vadd_f32(v86, v92); + float32x2_t v94 = vsub_f32(v21, v27); + float32x2_t v95 = vsub_f32(v34, v40); + float32x2_t v96 = vsub_f32(v47, v53); + float32x2_t v97 = vsub_f32(v60, v66); + float32x2_t v98 = vsub_f32(v73, v79); + float32x2_t v99 = vsub_f32(v86, v92); + float32x2_t v100 = vadd_f32(v41, v80); + float32x2_t v102 = vadd_f32(v28, v54); + float32x2_t v105 = vadd_f32(v95, v98); + float32x2_t v107 = vadd_f32(v94, v96); + float32x2_t v109 = vsub_f32(v41, v93); + float32x2_t v110 = vsub_f32(v54, v67); + float32x2_t v111 = vsub_f32(v28, v67); + float32x2_t v112 = vsub_f32(v80, v93); + float32x2_t v117 = vsub_f32(v95, v99); + float32x2_t v118 = vsub_f32(v94, v96); + float32x2_t v119 = vsub_f32(v95, v98); + float32x2_t v120 = vadd_f32(v94, v97); + float32x2_t v121 = vsub_f32(v98, v99); + float32x2_t v122 = vadd_f32(v96, v97); + float32x2_t v101 = vadd_f32(v100, v93); + float32x2_t v103 = vadd_f32(v102, v67); + float32x2_t v106 = vadd_f32(v105, v99); + float32x2_t v108 = vsub_f32(v107, v97); + float32x2_t v113 = vsub_f32(v109, v110); + float32x2_t v114 = vsub_f32(v111, v112); + float32x2_t v115 = vadd_f32(v109, v110); + float32x2_t v116 = vadd_f32(v111, v112); + float32x2_t v134 = vadd_f32(v117, v118); + float32x2_t v135 = vadd_f32(v119, v120); + float32x2_t v136 = vsub_f32(v121, v122); + 
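+      /* vrev64_f32 swaps the re/im halves of each value; multiplying the
+         swapped pair by a {+c, -c} constant (pre-scaled by dir) implements the
+         +/-j twiddle terms of this kernel. */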
float32x2_t v199 = vrev64_f32(v117); + float32x2_t v206 = vrev64_f32(v118); + float32x2_t v220 = vrev64_f32(v119); + float32x2_t v227 = vrev64_f32(v120); + float32x2_t v241 = vrev64_f32(v121); + float32x2_t v248 = vrev64_f32(v122); + float32x2_t v104 = vadd_f32(v101, v103); + float32x2_t v130 = vsub_f32(v103, v101); + float32x2_t v131 = vadd_f32(v106, v108); + float32x2_t v132 = vadd_f32(v113, v114); + float32x2_t v133 = vsub_f32(v115, v116); + float32x2_t v154 = vrev64_f32(v106); + float32x2_t v161 = vrev64_f32(v108); + float32x2_t v173 = vmul_f32(v113, v172); + float32x2_t v177 = vmul_f32(v114, v176); + float32x2_t v185 = vmul_f32(v115, v184); + float32x2_t v189 = vmul_f32(v116, v188); + float32x2_t v200 = vmul_f32(v199, v198); + float32x2_t v207 = vmul_f32(v206, v205); + float32x2_t v213 = vrev64_f32(v134); + float32x2_t v221 = vmul_f32(v220, v219); + float32x2_t v228 = vmul_f32(v227, v226); + float32x2_t v234 = vrev64_f32(v135); + float32x2_t v242 = vmul_f32(v241, v240); + float32x2_t v249 = vmul_f32(v248, v247); + float32x2_t v255 = vrev64_f32(v136); + float32x2_t v129 = vadd_f32(v128, v104); + float32x2_t v144 = vmul_f32(v104, v143); + float32x2_t v148 = vmul_f32(v130, v147); + float32x2_t v155 = vmul_f32(v154, v153); + float32x2_t v162 = vmul_f32(v161, v160); + float32x2_t v168 = vrev64_f32(v131); + float32x2_t v181 = vmul_f32(v132, v180); + float32x2_t v193 = vmul_f32(v133, v192); + float32x2_t v214 = vmul_f32(v213, v212); + float32x2_t v235 = vmul_f32(v234, v233); + float32x2_t v256 = vmul_f32(v255, v254); + float32x2_t v258 = vadd_f32(v177, v173); + float32x2_t v169 = vmul_f32(v168, v167); + float32x2_t v257 = vsub_f32(v129, v144); + float32x2_t v259 = vsub_f32(v258, v148); + float32x2_t v260 = vadd_f32(v177, v181); + float32x2_t v262 = vsub_f32(v181, v173); + float32x2_t v270 = vsub_f32(v200, v214); + float32x2_t v271 = vsub_f32(v207, v214); + float32x2_t v272 = vsub_f32(v221, v235); + float32x2_t v273 = vsub_f32(v228, v235); + float32x2_t v274 = vsub_f32(v242, v256); + float32x2_t v275 = vadd_f32(v249, v256); + int16x4_t v310 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v129, 15), (int32x2_t){0, 0})); + float32x2_t v261 = vadd_f32(v260, v148); + float32x2_t v263 = vsub_f32(v262, v148); + float32x2_t v264 = vadd_f32(v257, v185); + float32x2_t v266 = vsub_f32(v257, v189); + float32x2_t v268 = vsub_f32(v257, v185); + float32x2_t v276 = vsub_f32(v155, v169); + float32x2_t v277 = vsub_f32(v162, v169); + float32x2_t v288 = vadd_f32(v270, v274); + float32x2_t v290 = vadd_f32(v272, v274); + float32x2_t v292 = vsub_f32(v271, v275); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v310), 0); + float32x2_t v265 = vadd_f32(v264, v189); + float32x2_t v267 = vsub_f32(v266, v193); + float32x2_t v269 = vadd_f32(v268, v193); + float32x2_t v284 = vsub_f32(v277, v270); + float32x2_t v286 = vsub_f32(v275, v276); + float32x2_t v289 = vadd_f32(v288, v277); + float32x2_t v291 = vsub_f32(v290, v277); + float32x2_t v293 = vsub_f32(v292, v276); + float32x2_t v294 = vadd_f32(v276, v271); + float32x2_t v278 = vadd_f32(v259, v265); + float32x2_t v279 = vadd_f32(v261, v267); + float32x2_t v280 = vsub_f32(v267, v261); + float32x2_t v281 = vadd_f32(v263, v269); + float32x2_t v282 = vsub_f32(v265, v259); + float32x2_t v283 = vsub_f32(v269, v263); + float32x2_t v285 = vadd_f32(v284, v272); + float32x2_t v287 = vsub_f32(v286, v273); + float32x2_t v295 = vsub_f32(v294, v273); + float32x2_t v296 = vsub_f32(v278, v285); + float32x2_t v297 = vadd_f32(v279, v287); + float32x2_t v298 = vsub_f32(v280, v289); + float32x2_t 
v299 = vsub_f32(v281, v291); + float32x2_t v300 = vadd_f32(v282, v293); + float32x2_t v301 = vsub_f32(v283, v295); + float32x2_t v302 = vadd_f32(v283, v295); + float32x2_t v303 = vsub_f32(v282, v293); + float32x2_t v304 = vadd_f32(v281, v291); + float32x2_t v305 = vadd_f32(v280, v289); + float32x2_t v306 = vsub_f32(v279, v287); + float32x2_t v307 = vadd_f32(v278, v285); + int16x4_t v316 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v296, 15), (int32x2_t){0, 0})); + int16x4_t v322 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v297, 15), (int32x2_t){0, 0})); + int16x4_t v328 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v298, 15), (int32x2_t){0, 0})); + int16x4_t v334 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v299, 15), (int32x2_t){0, 0})); + int16x4_t v340 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v300, 15), (int32x2_t){0, 0})); + int16x4_t v346 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v301, 15), (int32x2_t){0, 0})); + int16x4_t v352 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v302, 15), (int32x2_t){0, 0})); + int16x4_t v358 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v303, 15), (int32x2_t){0, 0})); + int16x4_t v364 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v304, 15), (int32x2_t){0, 0})); + int16x4_t v370 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v305, 15), (int32x2_t){0, 0})); + int16x4_t v376 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v306, 15), (int32x2_t){0, 0})); + int16x4_t v382 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v307, 15), (int32x2_t){0, 0})); + v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v316), 0); + v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v322), 0); + v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v328), 0); + v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v334), 0); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v340), 0); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v346), 0); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v352), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v358), 0); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v364), 0); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v370), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v376), 0); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v382), 0); + v5 += 1 * 1; + v6 += 1 * 1; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uu13(const armral_cmplx_int16_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * 1; + int64_t v12 = v10 * 1; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + float v171 = 1.0833333333333333e+00F; + float v176 = -3.0046260628866578e-01F; + float v181 = -7.4927933062613905e-01F; + float v188 = -4.0100212832186721e-01F; + float v195 = -5.7514072947400308e-01F; + float v202 = 5.2422663952658211e-01F; + float v207 = 5.1652078062348972e-01F; + float v212 = 7.7058589030924258e-03F; + float v217 = 4.2763404682656941e-01F; + float v222 = 1.5180597207438440e-01F; + float v227 = 5.7944001890096386e-01F; + float v232 = -1.1543953381323635e+00F; + float v239 = -9.0655220171271012e-01F; + float v246 = -8.1857027294591811e-01F; + float v253 = -1.1971367726043427e+00F; + float v260 = -8.6131170741789742e-01F; + float v267 = -1.1091548438375507e+00F; + float v274 = 
-4.2741434471979367e-02F; + float v281 = 4.5240494294812715e-02F; + float v288 = -2.9058457089163264e-01F; + const int32_t *v455 = &v5[v0]; + int32_t *v703 = &v6[v2]; + int64_t v27 = v0 * 12; + int64_t v36 = v0 * 2; + int64_t v44 = v0 * 11; + int64_t v53 = v0 * 3; + int64_t v61 = v0 * 10; + int64_t v70 = v0 * 4; + int64_t v78 = v0 * 9; + int64_t v87 = v0 * 5; + int64_t v95 = v0 * 8; + int64_t v104 = v0 * 6; + int64_t v112 = v0 * 7; + float v184 = v4 * v181; + float v191 = v4 * v188; + float v198 = v4 * v195; + float v235 = v4 * v232; + float v242 = v4 * v239; + float v249 = v4 * v246; + float v256 = v4 * v253; + float v263 = v4 * v260; + float v270 = v4 * v267; + float v277 = v4 * v274; + float v284 = v4 * v281; + float v291 = v4 * v288; + int64_t v354 = v2 * 12; + int64_t v362 = v2 * 11; + int64_t v370 = v2 * 10; + int64_t v378 = v2 * 9; + int64_t v386 = v2 * 8; + int64_t v394 = v2 * 7; + int64_t v402 = v2 * 6; + int64_t v410 = v2 * 5; + int64_t v418 = v2 * 4; + int64_t v426 = v2 * 3; + int64_t v434 = v2 * 2; + const int32_t *v564 = &v5[0]; + svfloat32_t v568 = svdup_n_f32(v171); + svfloat32_t v569 = svdup_n_f32(v176); + svfloat32_t v573 = svdup_n_f32(v202); + svfloat32_t v574 = svdup_n_f32(v207); + svfloat32_t v575 = svdup_n_f32(v212); + svfloat32_t v576 = svdup_n_f32(v217); + svfloat32_t v577 = svdup_n_f32(v222); + svfloat32_t v578 = svdup_n_f32(v227); + int32_t *v595 = &v6[0]; + svfloat32_t v25 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v455[0])), + 1.F / (1ULL << 15ULL)); + const int32_t *v464 = &v5[v27]; + const int32_t *v473 = &v5[v36]; + const int32_t *v482 = &v5[v44]; + const int32_t *v491 = &v5[v53]; + const int32_t *v500 = &v5[v61]; + const int32_t *v509 = &v5[v70]; + const int32_t *v518 = &v5[v78]; + const int32_t *v527 = &v5[v87]; + const int32_t *v536 = &v5[v95]; + const int32_t *v545 = &v5[v104]; + const int32_t *v554 = &v5[v112]; + svfloat32_t v570 = svdup_n_f32(v184); + svfloat32_t v571 = svdup_n_f32(v191); + svfloat32_t v572 = svdup_n_f32(v198); + svfloat32_t v579 = svdup_n_f32(v235); + svfloat32_t v580 = svdup_n_f32(v242); + svfloat32_t v581 = svdup_n_f32(v249); + svfloat32_t v582 = svdup_n_f32(v256); + svfloat32_t v583 = svdup_n_f32(v263); + svfloat32_t v584 = svdup_n_f32(v270); + svfloat32_t v585 = svdup_n_f32(v277); + svfloat32_t v586 = svdup_n_f32(v284); + svfloat32_t v587 = svdup_n_f32(v291); + int32_t *v604 = &v6[v354]; + int32_t *v613 = &v6[v362]; + int32_t *v622 = &v6[v370]; + int32_t *v631 = &v6[v378]; + int32_t *v640 = &v6[v386]; + int32_t *v649 = &v6[v394]; + int32_t *v658 = &v6[v402]; + int32_t *v667 = &v6[v410]; + int32_t *v676 = &v6[v418]; + int32_t *v685 = &v6[v426]; + int32_t *v694 = &v6[v434]; + svfloat32_t v156 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v564[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v33 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v464[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v42 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v473[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v50 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v482[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v59 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v491[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v67 = 
svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v500[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v76 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v509[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v84 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v518[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v93 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v527[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v101 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v536[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v110 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v545[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v118 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v554[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v34 = svadd_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v51 = svadd_f32_x(svptrue_b32(), v42, v50); + svfloat32_t v68 = svadd_f32_x(svptrue_b32(), v59, v67); + svfloat32_t v85 = svadd_f32_x(svptrue_b32(), v76, v84); + svfloat32_t v102 = svadd_f32_x(svptrue_b32(), v93, v101); + svfloat32_t v119 = svadd_f32_x(svptrue_b32(), v110, v118); + svfloat32_t v120 = svsub_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v121 = svsub_f32_x(svptrue_b32(), v42, v50); + svfloat32_t v122 = svsub_f32_x(svptrue_b32(), v59, v67); + svfloat32_t v123 = svsub_f32_x(svptrue_b32(), v76, v84); + svfloat32_t v124 = svsub_f32_x(svptrue_b32(), v93, v101); + svfloat32_t v125 = svsub_f32_x(svptrue_b32(), v110, v118); + svfloat32_t v126 = svadd_f32_x(svptrue_b32(), v51, v102); + svfloat32_t v128 = svadd_f32_x(svptrue_b32(), v34, v68); + svfloat32_t v131 = svadd_f32_x(svptrue_b32(), v121, v124); + svfloat32_t v133 = svadd_f32_x(svptrue_b32(), v120, v122); + svfloat32_t v135 = svsub_f32_x(svptrue_b32(), v51, v119); + svfloat32_t v136 = svsub_f32_x(svptrue_b32(), v68, v85); + svfloat32_t v137 = svsub_f32_x(svptrue_b32(), v34, v85); + svfloat32_t v138 = svsub_f32_x(svptrue_b32(), v102, v119); + svfloat32_t v143 = svsub_f32_x(svptrue_b32(), v121, v125); + svfloat32_t v144 = svsub_f32_x(svptrue_b32(), v120, v122); + svfloat32_t v145 = svsub_f32_x(svptrue_b32(), v121, v124); + svfloat32_t v146 = svadd_f32_x(svptrue_b32(), v120, v123); + svfloat32_t v147 = svsub_f32_x(svptrue_b32(), v124, v125); + svfloat32_t v148 = svadd_f32_x(svptrue_b32(), v122, v123); + svfloat32_t v127 = svadd_f32_x(svptrue_b32(), v126, v119); + svfloat32_t v129 = svadd_f32_x(svptrue_b32(), v128, v85); + svfloat32_t v132 = svadd_f32_x(svptrue_b32(), v131, v125); + svfloat32_t v134 = svsub_f32_x(svptrue_b32(), v133, v123); + svfloat32_t v139 = svsub_f32_x(svptrue_b32(), v135, v136); + svfloat32_t v140 = svsub_f32_x(svptrue_b32(), v137, v138); + svfloat32_t v141 = svadd_f32_x(svptrue_b32(), v135, v136); + svfloat32_t v142 = svadd_f32_x(svptrue_b32(), v137, v138); + svfloat32_t v162 = svadd_f32_x(svptrue_b32(), v143, v144); + svfloat32_t v163 = svadd_f32_x(svptrue_b32(), v145, v146); + svfloat32_t v164 = svsub_f32_x(svptrue_b32(), v147, v148); + svfloat32_t zero237 = svdup_n_f32(0); + svfloat32_t v237 = svcmla_f32_x(pred_full, zero237, v579, v143, 90); + svfloat32_t zero244 = svdup_n_f32(0); + svfloat32_t v244 = svcmla_f32_x(pred_full, zero244, v580, v144, 90); + svfloat32_t zero258 = 
svdup_n_f32(0); + svfloat32_t v258 = svcmla_f32_x(pred_full, zero258, v582, v145, 90); + svfloat32_t zero265 = svdup_n_f32(0); + svfloat32_t v265 = svcmla_f32_x(pred_full, zero265, v583, v146, 90); + svfloat32_t zero279 = svdup_n_f32(0); + svfloat32_t v279 = svcmla_f32_x(pred_full, zero279, v585, v147, 90); + svfloat32_t v130 = svadd_f32_x(svptrue_b32(), v127, v129); + svfloat32_t v158 = svsub_f32_x(svptrue_b32(), v129, v127); + svfloat32_t v159 = svadd_f32_x(svptrue_b32(), v132, v134); + svfloat32_t v160 = svadd_f32_x(svptrue_b32(), v139, v140); + svfloat32_t v161 = svsub_f32_x(svptrue_b32(), v141, v142); + svfloat32_t zero186 = svdup_n_f32(0); + svfloat32_t v186 = svcmla_f32_x(pred_full, zero186, v570, v132, 90); + svfloat32_t zero193 = svdup_n_f32(0); + svfloat32_t v193 = svcmla_f32_x(pred_full, zero193, v571, v134, 90); + svfloat32_t v205 = svmul_f32_x(svptrue_b32(), v139, v573); + svfloat32_t zero251 = svdup_n_f32(0); + svfloat32_t v251 = svcmla_f32_x(pred_full, zero251, v581, v162, 90); + svfloat32_t zero272 = svdup_n_f32(0); + svfloat32_t v272 = svcmla_f32_x(pred_full, zero272, v584, v163, 90); + svfloat32_t zero293 = svdup_n_f32(0); + svfloat32_t v293 = svcmla_f32_x(pred_full, zero293, v587, v164, 90); + svfloat32_t v157 = svadd_f32_x(svptrue_b32(), v156, v130); + svfloat32_t zero200 = svdup_n_f32(0); + svfloat32_t v200 = svcmla_f32_x(pred_full, zero200, v572, v159, 90); + svfloat32_t v215 = svmul_f32_x(svptrue_b32(), v160, v575); + svfloat32_t v295 = svmla_f32_x(pred_full, v205, v140, v574); + svfloat32_t v307 = svsub_f32_x(svptrue_b32(), v237, v251); + svfloat32_t v308 = svsub_f32_x(svptrue_b32(), v244, v251); + svfloat32_t v309 = svsub_f32_x(svptrue_b32(), v258, v272); + svfloat32_t v310 = svsub_f32_x(svptrue_b32(), v265, v272); + svfloat32_t v311 = svsub_f32_x(svptrue_b32(), v279, v293); + svfloat32_t v312 = svcmla_f32_x(pred_full, v293, v586, v148, 90); + svfloat32_t v294 = svmls_f32_x(pred_full, v157, v130, v568); + svfloat32_t v296 = svmls_f32_x(pred_full, v295, v158, v569); + svfloat32_t v297 = svmla_f32_x(pred_full, v215, v140, v574); + svfloat32_t v299 = svnmls_f32_x(pred_full, v205, v160, v575); + svfloat32_t v313 = svsub_f32_x(svptrue_b32(), v186, v200); + svfloat32_t v314 = svsub_f32_x(svptrue_b32(), v193, v200); + svfloat32_t v325 = svadd_f32_x(svptrue_b32(), v307, v311); + svfloat32_t v327 = svadd_f32_x(svptrue_b32(), v309, v311); + svfloat32_t v329 = svsub_f32_x(svptrue_b32(), v308, v312); + svint16_t v347 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v157, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v298 = svmla_f32_x(pred_full, v297, v158, v569); + svfloat32_t v300 = svmls_f32_x(pred_full, v299, v158, v569); + svfloat32_t v301 = svmla_f32_x(pred_full, v294, v141, v576); + svfloat32_t v303 = svmls_f32_x(pred_full, v294, v142, v577); + svfloat32_t v305 = svmls_f32_x(pred_full, v294, v141, v576); + svfloat32_t v321 = svsub_f32_x(svptrue_b32(), v314, v307); + svfloat32_t v323 = svsub_f32_x(svptrue_b32(), v312, v313); + svfloat32_t v326 = svadd_f32_x(svptrue_b32(), v325, v314); + svfloat32_t v328 = svsub_f32_x(svptrue_b32(), v327, v314); + svfloat32_t v330 = svsub_f32_x(svptrue_b32(), v329, v313); + svfloat32_t v331 = svadd_f32_x(svptrue_b32(), v313, v308); + svst1w_u64(pred_full, (unsigned *)(v595), svreinterpret_u64_s16(v347)); + svfloat32_t v302 = svmla_f32_x(pred_full, v301, v142, v577); + svfloat32_t v304 = svmls_f32_x(pred_full, v303, 
v161, v578); + svfloat32_t v306 = svmla_f32_x(pred_full, v305, v161, v578); + svfloat32_t v322 = svadd_f32_x(svptrue_b32(), v321, v309); + svfloat32_t v324 = svsub_f32_x(svptrue_b32(), v323, v310); + svfloat32_t v332 = svsub_f32_x(svptrue_b32(), v331, v310); + svfloat32_t v315 = svadd_f32_x(svptrue_b32(), v296, v302); + svfloat32_t v316 = svadd_f32_x(svptrue_b32(), v298, v304); + svfloat32_t v317 = svsub_f32_x(svptrue_b32(), v304, v298); + svfloat32_t v318 = svadd_f32_x(svptrue_b32(), v300, v306); + svfloat32_t v319 = svsub_f32_x(svptrue_b32(), v302, v296); + svfloat32_t v320 = svsub_f32_x(svptrue_b32(), v306, v300); + svfloat32_t v333 = svsub_f32_x(svptrue_b32(), v315, v322); + svfloat32_t v334 = svadd_f32_x(svptrue_b32(), v316, v324); + svfloat32_t v335 = svsub_f32_x(svptrue_b32(), v317, v326); + svfloat32_t v336 = svsub_f32_x(svptrue_b32(), v318, v328); + svfloat32_t v337 = svadd_f32_x(svptrue_b32(), v319, v330); + svfloat32_t v338 = svsub_f32_x(svptrue_b32(), v320, v332); + svfloat32_t v339 = svadd_f32_x(svptrue_b32(), v320, v332); + svfloat32_t v340 = svsub_f32_x(svptrue_b32(), v319, v330); + svfloat32_t v341 = svadd_f32_x(svptrue_b32(), v318, v328); + svfloat32_t v342 = svadd_f32_x(svptrue_b32(), v317, v326); + svfloat32_t v343 = svsub_f32_x(svptrue_b32(), v316, v324); + svfloat32_t v344 = svadd_f32_x(svptrue_b32(), v315, v322); + svint16_t v355 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v333, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v363 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v334, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v371 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v335, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v379 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v336, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v387 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v337, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v395 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v338, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v403 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v339, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v411 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v340, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v419 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v341, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v427 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v342, (float)(1ULL 
<< 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v435 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v343, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v443 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v344, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v604), svreinterpret_u64_s16(v355)); + svst1w_u64(pred_full, (unsigned *)(v613), svreinterpret_u64_s16(v363)); + svst1w_u64(pred_full, (unsigned *)(v622), svreinterpret_u64_s16(v371)); + svst1w_u64(pred_full, (unsigned *)(v631), svreinterpret_u64_s16(v379)); + svst1w_u64(pred_full, (unsigned *)(v640), svreinterpret_u64_s16(v387)); + svst1w_u64(pred_full, (unsigned *)(v649), svreinterpret_u64_s16(v395)); + svst1w_u64(pred_full, (unsigned *)(v658), svreinterpret_u64_s16(v403)); + svst1w_u64(pred_full, (unsigned *)(v667), svreinterpret_u64_s16(v411)); + svst1w_u64(pred_full, (unsigned *)(v676), svreinterpret_u64_s16(v419)); + svst1w_u64(pred_full, (unsigned *)(v685), svreinterpret_u64_s16(v427)); + svst1w_u64(pred_full, (unsigned *)(v694), svreinterpret_u64_s16(v435)); + svst1w_u64(pred_full, (unsigned *)(v703), svreinterpret_u64_s16(v443)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uu14(const armral_cmplx_int16_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + for (int j = 0; j < howmany; j += 1) { + int16x4_t v82 = vld1s_s16(&v5[istride]); + float v220 = -1.1666666666666665e+00F; + float v224 = 7.9015646852540022e-01F; + float v228 = 5.5854267289647742e-02F; + float v232 = 7.3430220123575241e-01F; + float v235 = 4.4095855184409838e-01F; + float v236 = -4.4095855184409838e-01F; + float v242 = 3.4087293062393137e-01F; + float v243 = -3.4087293062393137e-01F; + float v249 = -5.3396936033772524e-01F; + float v250 = 5.3396936033772524e-01F; + float v256 = 8.7484229096165667e-01F; + float v257 = -8.7484229096165667e-01F; + float32x2_t v259 = (float32x2_t){v4, v4}; + int16x4_t v20 = vld1s_s16(&v5[0]); + float32x2_t v83 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v82)), 15); + float32x2_t v221 = (float32x2_t){v220, v220}; + float32x2_t v225 = (float32x2_t){v224, v224}; + float32x2_t v229 = (float32x2_t){v228, v228}; + float32x2_t v233 = (float32x2_t){v232, v232}; + float32x2_t v237 = (float32x2_t){v235, v236}; + float32x2_t v244 = (float32x2_t){v242, v243}; + float32x2_t v251 = (float32x2_t){v249, v250}; + float32x2_t v258 = (float32x2_t){v256, v257}; + float32x2_t v21 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v20)), 15); + int16x4_t v26 = vld1s_s16(&v5[istride * 7]); + int16x4_t v34 = vld1s_s16(&v5[istride * 2]); + int16x4_t v40 = vld1s_s16(&v5[istride * 9]); + int16x4_t v48 = vld1s_s16(&v5[istride * 4]); + int16x4_t v54 = vld1s_s16(&v5[istride * 11]); + int16x4_t v62 = vld1s_s16(&v5[istride * 6]); + int16x4_t v68 = vld1s_s16(&v5[istride * 13]); + int16x4_t v76 = vld1s_s16(&v5[istride * 8]); + int16x4_t v90 = vld1s_s16(&v5[istride * 10]); + int16x4_t v96 = vld1s_s16(&v5[istride * 3]); + int16x4_t v104 = vld1s_s16(&v5[istride * 12]); + int16x4_t v110 = vld1s_s16(&v5[istride * 5]); + float32x2_t v239 = 
vmul_f32(v259, v237); + float32x2_t v246 = vmul_f32(v259, v244); + float32x2_t v253 = vmul_f32(v259, v251); + float32x2_t v260 = vmul_f32(v259, v258); + float32x2_t v27 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v26)), 15); + float32x2_t v35 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v34)), 15); + float32x2_t v41 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v40)), 15); + float32x2_t v49 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v48)), 15); + float32x2_t v55 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v54)), 15); + float32x2_t v63 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v62)), 15); + float32x2_t v69 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v68)), 15); + float32x2_t v77 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v76)), 15); + float32x2_t v91 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v90)), 15); + float32x2_t v97 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v96)), 15); + float32x2_t v105 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v104)), 15); + float32x2_t v111 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v110)), 15); + float32x2_t v28 = vadd_f32(v21, v27); + float32x2_t v29 = vsub_f32(v21, v27); + float32x2_t v42 = vadd_f32(v35, v41); + float32x2_t v43 = vsub_f32(v35, v41); + float32x2_t v56 = vadd_f32(v49, v55); + float32x2_t v57 = vsub_f32(v49, v55); + float32x2_t v70 = vadd_f32(v63, v69); + float32x2_t v71 = vsub_f32(v63, v69); + float32x2_t v84 = vadd_f32(v77, v83); + float32x2_t v85 = vsub_f32(v77, v83); + float32x2_t v98 = vadd_f32(v91, v97); + float32x2_t v99 = vsub_f32(v91, v97); + float32x2_t v112 = vadd_f32(v105, v111); + float32x2_t v113 = vsub_f32(v105, v111); + float32x2_t v114 = vadd_f32(v42, v112); + float32x2_t v115 = vsub_f32(v42, v112); + float32x2_t v116 = vadd_f32(v84, v70); + float32x2_t v117 = vsub_f32(v84, v70); + float32x2_t v118 = vadd_f32(v56, v98); + float32x2_t v119 = vsub_f32(v56, v98); + float32x2_t v198 = vadd_f32(v43, v113); + float32x2_t v199 = vsub_f32(v43, v113); + float32x2_t v200 = vadd_f32(v85, v71); + float32x2_t v201 = vsub_f32(v85, v71); + float32x2_t v202 = vadd_f32(v57, v99); + float32x2_t v203 = vsub_f32(v57, v99); + float32x2_t v120 = vadd_f32(v114, v116); + float32x2_t v123 = vsub_f32(v114, v116); + float32x2_t v124 = vsub_f32(v116, v118); + float32x2_t v125 = vsub_f32(v118, v114); + float32x2_t v126 = vadd_f32(v115, v117); + float32x2_t v128 = vsub_f32(v115, v117); + float32x2_t v129 = vsub_f32(v117, v119); + float32x2_t v130 = vsub_f32(v119, v115); + float32x2_t v204 = vadd_f32(v198, v200); + float32x2_t v207 = vsub_f32(v198, v200); + float32x2_t v208 = vsub_f32(v200, v202); + float32x2_t v209 = vsub_f32(v202, v198); + float32x2_t v210 = vadd_f32(v199, v201); + float32x2_t v212 = vsub_f32(v199, v201); + float32x2_t v213 = vsub_f32(v201, v203); + float32x2_t v214 = vsub_f32(v203, v199); + float32x2_t v121 = vadd_f32(v120, v118); + float32x2_t v127 = vadd_f32(v126, v119); + float32x2_t v142 = vmul_f32(v123, v225); + float32x2_t v146 = vmul_f32(v124, v229); + float32x2_t v150 = vmul_f32(v125, v233); + float32x2_t v163 = vrev64_f32(v128); + float32x2_t v170 = vrev64_f32(v129); + float32x2_t v177 = vrev64_f32(v130); + float32x2_t v205 = vadd_f32(v204, v202); + float32x2_t v211 = vadd_f32(v210, v203); + float32x2_t v226 = vmul_f32(v207, v225); + float32x2_t v230 = vmul_f32(v208, v229); + float32x2_t v234 = vmul_f32(v209, v233); + float32x2_t v247 = vrev64_f32(v212); + float32x2_t v254 = vrev64_f32(v213); + float32x2_t v261 = vrev64_f32(v214); + float32x2_t v122 = vadd_f32(v121, v28); + float32x2_t v138 = vmul_f32(v121, v221); + float32x2_t v156 = vrev64_f32(v127); + float32x2_t v164 = 
vmul_f32(v163, v246); + float32x2_t v171 = vmul_f32(v170, v253); + float32x2_t v178 = vmul_f32(v177, v260); + float32x2_t v206 = vadd_f32(v205, v29); + float32x2_t v222 = vmul_f32(v205, v221); + float32x2_t v240 = vrev64_f32(v211); + float32x2_t v248 = vmul_f32(v247, v246); + float32x2_t v255 = vmul_f32(v254, v253); + float32x2_t v262 = vmul_f32(v261, v260); + float32x2_t v157 = vmul_f32(v156, v239); + float32x2_t v179 = vadd_f32(v122, v138); + float32x2_t v241 = vmul_f32(v240, v239); + float32x2_t v263 = vadd_f32(v206, v222); + int16x4_t v284 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v122, 15), (int32x2_t){0, 0})); + int16x4_t v290 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v206, 15), (int32x2_t){0, 0})); + float32x2_t v180 = vadd_f32(v179, v142); + float32x2_t v182 = vsub_f32(v179, v142); + float32x2_t v184 = vsub_f32(v179, v146); + float32x2_t v186 = vadd_f32(v157, v164); + float32x2_t v188 = vsub_f32(v157, v164); + float32x2_t v190 = vsub_f32(v157, v171); + float32x2_t v264 = vadd_f32(v263, v226); + float32x2_t v266 = vsub_f32(v263, v226); + float32x2_t v268 = vsub_f32(v263, v230); + float32x2_t v270 = vadd_f32(v241, v248); + float32x2_t v272 = vsub_f32(v241, v248); + float32x2_t v274 = vsub_f32(v241, v255); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v284), 0); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v290), 0); + float32x2_t v181 = vadd_f32(v180, v146); + float32x2_t v183 = vsub_f32(v182, v150); + float32x2_t v185 = vadd_f32(v184, v150); + float32x2_t v187 = vadd_f32(v186, v171); + float32x2_t v189 = vsub_f32(v188, v178); + float32x2_t v191 = vadd_f32(v190, v178); + float32x2_t v265 = vadd_f32(v264, v230); + float32x2_t v267 = vsub_f32(v266, v234); + float32x2_t v269 = vadd_f32(v268, v234); + float32x2_t v271 = vadd_f32(v270, v255); + float32x2_t v273 = vsub_f32(v272, v262); + float32x2_t v275 = vadd_f32(v274, v262); + float32x2_t v192 = vadd_f32(v181, v187); + float32x2_t v193 = vsub_f32(v181, v187); + float32x2_t v194 = vadd_f32(v183, v189); + float32x2_t v195 = vsub_f32(v183, v189); + float32x2_t v196 = vadd_f32(v185, v191); + float32x2_t v197 = vsub_f32(v185, v191); + float32x2_t v276 = vadd_f32(v265, v271); + float32x2_t v277 = vsub_f32(v265, v271); + float32x2_t v278 = vadd_f32(v267, v273); + float32x2_t v279 = vsub_f32(v267, v273); + float32x2_t v280 = vadd_f32(v269, v275); + float32x2_t v281 = vsub_f32(v269, v275); + int16x4_t v296 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v193, 15), (int32x2_t){0, 0})); + int16x4_t v302 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v277, 15), (int32x2_t){0, 0})); + int16x4_t v308 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v195, 15), (int32x2_t){0, 0})); + int16x4_t v314 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v279, 15), (int32x2_t){0, 0})); + int16x4_t v320 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v196, 15), (int32x2_t){0, 0})); + int16x4_t v326 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v280, 15), (int32x2_t){0, 0})); + int16x4_t v332 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v197, 15), (int32x2_t){0, 0})); + int16x4_t v338 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v281, 15), (int32x2_t){0, 0})); + int16x4_t v344 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v194, 15), (int32x2_t){0, 0})); + int16x4_t v350 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v278, 15), (int32x2_t){0, 0})); + int16x4_t v356 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v192, 15), (int32x2_t){0, 0})); + int16x4_t v362 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v276, 15), (int32x2_t){0, 0})); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v296), 
0); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v302), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v308), 0); + v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v314), 0); + v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v320), 0); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v326), 0); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v332), 0); + v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v338), 0); + v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v344), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v350), 0); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v356), 0); + v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v362), 0); + v5 += 1 * 1; + v6 += 1 * 1; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uu14(const armral_cmplx_int16_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * 1; + int64_t v12 = v10 * 1; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + float v256 = -1.1666666666666665e+00F; + float v261 = 7.9015646852540022e-01F; + float v266 = 5.5854267289647742e-02F; + float v271 = 7.3430220123575241e-01F; + float v276 = -4.4095855184409838e-01F; + float v283 = -3.4087293062393137e-01F; + float v290 = 5.3396936033772524e-01F; + float v297 = -8.7484229096165667e-01F; + const int32_t *v522 = &v5[v0]; + int32_t *v613 = &v6[v2]; + int64_t v27 = v0 * 7; + int64_t v37 = v0 * 2; + int64_t v45 = v0 * 9; + int64_t v55 = v0 * 4; + int64_t v63 = v0 * 11; + int64_t v73 = v0 * 6; + int64_t v81 = v0 * 13; + int64_t v91 = v0 * 8; + int64_t v109 = v0 * 10; + int64_t v117 = v0 * 3; + int64_t v127 = v0 * 12; + int64_t v135 = v0 * 5; + float v279 = v4 * v276; + float v286 = v4 * v283; + float v293 = v4 * v290; + float v300 = v4 * v297; + int64_t v331 = v2 * 7; + int64_t v339 = v2 * 8; + int64_t v355 = v2 * 2; + int64_t v363 = v2 * 9; + int64_t v371 = v2 * 10; + int64_t v379 = v2 * 3; + int64_t v387 = v2 * 4; + int64_t v395 = v2 * 11; + int64_t v403 = v2 * 12; + int64_t v411 = v2 * 5; + int64_t v419 = v2 * 6; + int64_t v427 = v2 * 13; + const int32_t *v441 = &v5[0]; + svfloat32_t v571 = svdup_n_f32(v256); + svfloat32_t v572 = svdup_n_f32(v261); + svfloat32_t v573 = svdup_n_f32(v266); + svfloat32_t v574 = svdup_n_f32(v271); + int32_t *v586 = &v6[0]; + svfloat32_t v105 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v522[0])), + 1.F / (1ULL << 15ULL)); + const int32_t *v450 = &v5[v27]; + const int32_t *v459 = &v5[v37]; + const int32_t *v468 = &v5[v45]; + const int32_t *v477 = &v5[v55]; + const int32_t *v486 = &v5[v63]; + const int32_t *v495 = &v5[v73]; + const int32_t *v504 = &v5[v81]; + const int32_t *v513 = &v5[v91]; + const int32_t *v531 = &v5[v109]; + const int32_t *v540 = &v5[v117]; + const int32_t *v549 = &v5[v127]; + const int32_t *v558 = &v5[v135]; + svfloat32_t v575 = svdup_n_f32(v279); + svfloat32_t v576 = svdup_n_f32(v286); + svfloat32_t v577 = svdup_n_f32(v293); + svfloat32_t v578 = svdup_n_f32(v300); + int32_t *v595 = &v6[v331]; + int32_t *v604 = &v6[v339]; + int32_t *v622 = &v6[v355]; + int32_t *v631 = &v6[v363]; + int32_t *v640 = &v6[v371]; + int32_t *v649 = &v6[v379]; + int32_t *v658 = &v6[v387]; + 
int32_t *v667 = &v6[v395]; + int32_t *v676 = &v6[v403]; + int32_t *v685 = &v6[v411]; + int32_t *v694 = &v6[v419]; + int32_t *v703 = &v6[v427]; + svfloat32_t v25 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v441[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v33 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v450[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v43 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v459[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v51 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v468[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v61 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v477[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v69 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v486[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v79 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v495[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v87 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v504[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v97 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v513[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v115 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v531[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v123 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v540[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v133 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v549[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v141 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v558[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v34 = svadd_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v35 = svsub_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v52 = svadd_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v53 = svsub_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v70 = svadd_f32_x(svptrue_b32(), v61, v69); + svfloat32_t v71 = svsub_f32_x(svptrue_b32(), v61, v69); + svfloat32_t v88 = svadd_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v89 = svsub_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v106 = svadd_f32_x(svptrue_b32(), v97, v105); + svfloat32_t v107 = svsub_f32_x(svptrue_b32(), v97, v105); + svfloat32_t v124 = svadd_f32_x(svptrue_b32(), v115, v123); + svfloat32_t v125 = svsub_f32_x(svptrue_b32(), v115, v123); + svfloat32_t v142 = svadd_f32_x(svptrue_b32(), v133, v141); + svfloat32_t v143 = svsub_f32_x(svptrue_b32(), v133, v141); + svfloat32_t v144 = svadd_f32_x(svptrue_b32(), v52, v142); + svfloat32_t v145 = svsub_f32_x(svptrue_b32(), v52, v142); + svfloat32_t v146 = svadd_f32_x(svptrue_b32(), v106, v88); + svfloat32_t v147 = svsub_f32_x(svptrue_b32(), v106, v88); + svfloat32_t v148 = svadd_f32_x(svptrue_b32(), v70, v124); + svfloat32_t v149 = svsub_f32_x(svptrue_b32(), v70, v124); + svfloat32_t v233 = svadd_f32_x(svptrue_b32(), v53, v143); + svfloat32_t v234 = svsub_f32_x(svptrue_b32(), v53, v143); + svfloat32_t v235 = svadd_f32_x(svptrue_b32(), v107, 
v89); + svfloat32_t v236 = svsub_f32_x(svptrue_b32(), v107, v89); + svfloat32_t v237 = svadd_f32_x(svptrue_b32(), v71, v125); + svfloat32_t v238 = svsub_f32_x(svptrue_b32(), v71, v125); + svfloat32_t v150 = svadd_f32_x(svptrue_b32(), v144, v146); + svfloat32_t v153 = svsub_f32_x(svptrue_b32(), v144, v146); + svfloat32_t v154 = svsub_f32_x(svptrue_b32(), v146, v148); + svfloat32_t v155 = svsub_f32_x(svptrue_b32(), v148, v144); + svfloat32_t v156 = svadd_f32_x(svptrue_b32(), v145, v147); + svfloat32_t v158 = svsub_f32_x(svptrue_b32(), v145, v147); + svfloat32_t v159 = svsub_f32_x(svptrue_b32(), v147, v149); + svfloat32_t v160 = svsub_f32_x(svptrue_b32(), v149, v145); + svfloat32_t v239 = svadd_f32_x(svptrue_b32(), v233, v235); + svfloat32_t v242 = svsub_f32_x(svptrue_b32(), v233, v235); + svfloat32_t v243 = svsub_f32_x(svptrue_b32(), v235, v237); + svfloat32_t v244 = svsub_f32_x(svptrue_b32(), v237, v233); + svfloat32_t v245 = svadd_f32_x(svptrue_b32(), v234, v236); + svfloat32_t v247 = svsub_f32_x(svptrue_b32(), v234, v236); + svfloat32_t v248 = svsub_f32_x(svptrue_b32(), v236, v238); + svfloat32_t v249 = svsub_f32_x(svptrue_b32(), v238, v234); + svfloat32_t v151 = svadd_f32_x(svptrue_b32(), v150, v148); + svfloat32_t v157 = svadd_f32_x(svptrue_b32(), v156, v149); + svfloat32_t zero199 = svdup_n_f32(0); + svfloat32_t v199 = svcmla_f32_x(pred_full, zero199, v576, v158, 90); + svfloat32_t zero206 = svdup_n_f32(0); + svfloat32_t v206 = svcmla_f32_x(pred_full, zero206, v577, v159, 90); + svfloat32_t zero213 = svdup_n_f32(0); + svfloat32_t v213 = svcmla_f32_x(pred_full, zero213, v578, v160, 90); + svfloat32_t v240 = svadd_f32_x(svptrue_b32(), v239, v237); + svfloat32_t v246 = svadd_f32_x(svptrue_b32(), v245, v238); + svfloat32_t zero288 = svdup_n_f32(0); + svfloat32_t v288 = svcmla_f32_x(pred_full, zero288, v576, v247, 90); + svfloat32_t zero295 = svdup_n_f32(0); + svfloat32_t v295 = svcmla_f32_x(pred_full, zero295, v577, v248, 90); + svfloat32_t zero302 = svdup_n_f32(0); + svfloat32_t v302 = svcmla_f32_x(pred_full, zero302, v578, v249, 90); + svfloat32_t v152 = svadd_f32_x(svptrue_b32(), v151, v34); + svfloat32_t zero192 = svdup_n_f32(0); + svfloat32_t v192 = svcmla_f32_x(pred_full, zero192, v575, v157, 90); + svfloat32_t v241 = svadd_f32_x(svptrue_b32(), v240, v35); + svfloat32_t zero281 = svdup_n_f32(0); + svfloat32_t v281 = svcmla_f32_x(pred_full, zero281, v575, v246, 90); + svfloat32_t v214 = svmla_f32_x(pred_full, v152, v151, v571); + svfloat32_t v221 = svadd_f32_x(svptrue_b32(), v192, v199); + svfloat32_t v223 = svsub_f32_x(svptrue_b32(), v192, v199); + svfloat32_t v225 = svsub_f32_x(svptrue_b32(), v192, v206); + svfloat32_t v303 = svmla_f32_x(pred_full, v241, v240, v571); + svfloat32_t v310 = svadd_f32_x(svptrue_b32(), v281, v288); + svfloat32_t v312 = svsub_f32_x(svptrue_b32(), v281, v288); + svfloat32_t v314 = svsub_f32_x(svptrue_b32(), v281, v295); + svint16_t v324 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v152, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v332 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v241, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v215 = svmla_f32_x(pred_full, v214, v153, v572); + svfloat32_t v217 = svmls_f32_x(pred_full, v214, v153, v572); + svfloat32_t v219 = svmls_f32_x(pred_full, v214, v154, v573); 
+ svfloat32_t v222 = svadd_f32_x(svptrue_b32(), v221, v206); + svfloat32_t v224 = svsub_f32_x(svptrue_b32(), v223, v213); + svfloat32_t v226 = svadd_f32_x(svptrue_b32(), v225, v213); + svfloat32_t v304 = svmla_f32_x(pred_full, v303, v242, v572); + svfloat32_t v306 = svmls_f32_x(pred_full, v303, v242, v572); + svfloat32_t v308 = svmls_f32_x(pred_full, v303, v243, v573); + svfloat32_t v311 = svadd_f32_x(svptrue_b32(), v310, v295); + svfloat32_t v313 = svsub_f32_x(svptrue_b32(), v312, v302); + svfloat32_t v315 = svadd_f32_x(svptrue_b32(), v314, v302); + svst1w_u64(pred_full, (unsigned *)(v586), svreinterpret_u64_s16(v324)); + svst1w_u64(pred_full, (unsigned *)(v595), svreinterpret_u64_s16(v332)); + svfloat32_t v216 = svmla_f32_x(pred_full, v215, v154, v573); + svfloat32_t v218 = svmls_f32_x(pred_full, v217, v155, v574); + svfloat32_t v220 = svmla_f32_x(pred_full, v219, v155, v574); + svfloat32_t v305 = svmla_f32_x(pred_full, v304, v243, v573); + svfloat32_t v307 = svmls_f32_x(pred_full, v306, v244, v574); + svfloat32_t v309 = svmla_f32_x(pred_full, v308, v244, v574); + svfloat32_t v227 = svadd_f32_x(svptrue_b32(), v216, v222); + svfloat32_t v228 = svsub_f32_x(svptrue_b32(), v216, v222); + svfloat32_t v229 = svadd_f32_x(svptrue_b32(), v218, v224); + svfloat32_t v230 = svsub_f32_x(svptrue_b32(), v218, v224); + svfloat32_t v231 = svadd_f32_x(svptrue_b32(), v220, v226); + svfloat32_t v232 = svsub_f32_x(svptrue_b32(), v220, v226); + svfloat32_t v316 = svadd_f32_x(svptrue_b32(), v305, v311); + svfloat32_t v317 = svsub_f32_x(svptrue_b32(), v305, v311); + svfloat32_t v318 = svadd_f32_x(svptrue_b32(), v307, v313); + svfloat32_t v319 = svsub_f32_x(svptrue_b32(), v307, v313); + svfloat32_t v320 = svadd_f32_x(svptrue_b32(), v309, v315); + svfloat32_t v321 = svsub_f32_x(svptrue_b32(), v309, v315); + svint16_t v340 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v228, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v348 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v317, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v356 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v230, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v364 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v319, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v372 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v231, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v380 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v320, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v388 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v232, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v396 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v321, (float)(1ULL << 
31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v404 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v229, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v412 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v318, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v420 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v227, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v428 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v316, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v604), svreinterpret_u64_s16(v340)); + svst1w_u64(pred_full, (unsigned *)(v613), svreinterpret_u64_s16(v348)); + svst1w_u64(pred_full, (unsigned *)(v622), svreinterpret_u64_s16(v356)); + svst1w_u64(pred_full, (unsigned *)(v631), svreinterpret_u64_s16(v364)); + svst1w_u64(pred_full, (unsigned *)(v640), svreinterpret_u64_s16(v372)); + svst1w_u64(pred_full, (unsigned *)(v649), svreinterpret_u64_s16(v380)); + svst1w_u64(pred_full, (unsigned *)(v658), svreinterpret_u64_s16(v388)); + svst1w_u64(pred_full, (unsigned *)(v667), svreinterpret_u64_s16(v396)); + svst1w_u64(pred_full, (unsigned *)(v676), svreinterpret_u64_s16(v404)); + svst1w_u64(pred_full, (unsigned *)(v685), svreinterpret_u64_s16(v412)); + svst1w_u64(pred_full, (unsigned *)(v694), svreinterpret_u64_s16(v420)); + svst1w_u64(pred_full, (unsigned *)(v703), svreinterpret_u64_s16(v428)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uu15(const armral_cmplx_int16_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + for (int j = 0; j < howmany; j += 1) { + int16x4_t v68 = vld1s_s16(&v5[istride]); + float v134 = -1.2500000000000000e+00F; + float v138 = 5.5901699437494745e-01F; + float v141 = 1.5388417685876268e+00F; + float v142 = -1.5388417685876268e+00F; + float v148 = 5.8778525229247325e-01F; + float v149 = -5.8778525229247325e-01F; + float v155 = 3.6327126400268028e-01F; + float v156 = -3.6327126400268028e-01F; + float v180 = -1.4999999999999998e+00F; + float v184 = 1.8749999999999998e+00F; + float v188 = -8.3852549156242107e-01F; + float v191 = -2.3082626528814396e+00F; + float v192 = 2.3082626528814396e+00F; + float v198 = -8.8167787843870971e-01F; + float v199 = 8.8167787843870971e-01F; + float v205 = -5.4490689600402031e-01F; + float v206 = 5.4490689600402031e-01F; + float v229 = 8.6602540378443871e-01F; + float v230 = -8.6602540378443871e-01F; + float v236 = -1.0825317547305484e+00F; + float v237 = 1.0825317547305484e+00F; + float v243 = 4.8412291827592718e-01F; + float v244 = -4.8412291827592718e-01F; + float32x2_t v246 = (float32x2_t){v4, v4}; + float v251 = -1.3326760640014592e+00F; + float v255 = -5.0903696045512736e-01F; + float v259 = -3.1460214309120460e-01F; + int16x4_t v34 = vld1s_s16(&v5[0]); + float32x2_t v69 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v68)), 15); + float32x2_t v135 = 
(float32x2_t){v134, v134}; + float32x2_t v139 = (float32x2_t){v138, v138}; + float32x2_t v143 = (float32x2_t){v141, v142}; + float32x2_t v150 = (float32x2_t){v148, v149}; + float32x2_t v157 = (float32x2_t){v155, v156}; + float32x2_t v181 = (float32x2_t){v180, v180}; + float32x2_t v185 = (float32x2_t){v184, v184}; + float32x2_t v189 = (float32x2_t){v188, v188}; + float32x2_t v193 = (float32x2_t){v191, v192}; + float32x2_t v200 = (float32x2_t){v198, v199}; + float32x2_t v207 = (float32x2_t){v205, v206}; + float32x2_t v231 = (float32x2_t){v229, v230}; + float32x2_t v238 = (float32x2_t){v236, v237}; + float32x2_t v245 = (float32x2_t){v243, v244}; + float32x2_t v252 = (float32x2_t){v251, v251}; + float32x2_t v256 = (float32x2_t){v255, v255}; + float32x2_t v260 = (float32x2_t){v259, v259}; + int16x4_t v20 = vld1s_s16(&v5[istride * 5]); + int16x4_t v26 = vld1s_s16(&v5[istride * 10]); + float32x2_t v35 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v34)), 15); + int16x4_t v41 = vld1s_s16(&v5[istride * 8]); + int16x4_t v47 = vld1s_s16(&v5[istride * 13]); + int16x4_t v55 = vld1s_s16(&v5[istride * 3]); + int16x4_t v62 = vld1s_s16(&v5[istride * 11]); + int16x4_t v76 = vld1s_s16(&v5[istride * 6]); + int16x4_t v83 = vld1s_s16(&v5[istride * 14]); + int16x4_t v89 = vld1s_s16(&v5[istride * 4]); + int16x4_t v97 = vld1s_s16(&v5[istride * 9]); + int16x4_t v104 = vld1s_s16(&v5[istride * 2]); + int16x4_t v110 = vld1s_s16(&v5[istride * 7]); + int16x4_t v118 = vld1s_s16(&v5[istride * 12]); + float32x2_t v145 = vmul_f32(v246, v143); + float32x2_t v152 = vmul_f32(v246, v150); + float32x2_t v159 = vmul_f32(v246, v157); + float32x2_t v195 = vmul_f32(v246, v193); + float32x2_t v202 = vmul_f32(v246, v200); + float32x2_t v209 = vmul_f32(v246, v207); + float32x2_t v233 = vmul_f32(v246, v231); + float32x2_t v240 = vmul_f32(v246, v238); + float32x2_t v247 = vmul_f32(v246, v245); + float32x2_t v21 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v20)), 15); + float32x2_t v27 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v26)), 15); + float32x2_t v42 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v41)), 15); + float32x2_t v48 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v47)), 15); + float32x2_t v56 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v55)), 15); + float32x2_t v63 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v62)), 15); + float32x2_t v77 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v76)), 15); + float32x2_t v84 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v83)), 15); + float32x2_t v90 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v89)), 15); + float32x2_t v98 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v97)), 15); + float32x2_t v105 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v104)), 15); + float32x2_t v111 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v110)), 15); + float32x2_t v119 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v118)), 15); + float32x2_t v28 = vadd_f32(v21, v27); + float32x2_t v29 = vsub_f32(v21, v27); + float32x2_t v49 = vadd_f32(v42, v48); + float32x2_t v50 = vsub_f32(v42, v48); + float32x2_t v70 = vadd_f32(v63, v69); + float32x2_t v71 = vsub_f32(v63, v69); + float32x2_t v91 = vadd_f32(v84, v90); + float32x2_t v92 = vsub_f32(v84, v90); + float32x2_t v112 = vadd_f32(v105, v111); + float32x2_t v113 = vsub_f32(v105, v111); + float32x2_t v36 = vadd_f32(v28, v35); + float32x2_t v57 = vadd_f32(v49, v56); + float32x2_t v78 = vadd_f32(v70, v77); + float32x2_t v99 = vadd_f32(v91, v98); + float32x2_t v120 = vadd_f32(v112, v119); + float32x2_t v171 = vadd_f32(v49, v112); + float32x2_t v172 = vsub_f32(v49, v112); + float32x2_t v173 = vadd_f32(v91, v70); + float32x2_t v174 = 
vsub_f32(v91, v70); + float32x2_t v221 = vadd_f32(v50, v113); + float32x2_t v222 = vsub_f32(v50, v113); + float32x2_t v223 = vadd_f32(v92, v71); + float32x2_t v224 = vsub_f32(v92, v71); + float32x2_t v121 = vadd_f32(v57, v120); + float32x2_t v122 = vsub_f32(v57, v120); + float32x2_t v123 = vadd_f32(v99, v78); + float32x2_t v124 = vsub_f32(v99, v78); + float32x2_t v175 = vadd_f32(v171, v173); + float32x2_t v176 = vsub_f32(v171, v173); + float32x2_t v177 = vadd_f32(v172, v174); + float32x2_t v196 = vrev64_f32(v172); + float32x2_t v210 = vrev64_f32(v174); + float32x2_t v225 = vadd_f32(v221, v223); + float32x2_t v226 = vsub_f32(v221, v223); + float32x2_t v227 = vadd_f32(v222, v224); + float32x2_t v253 = vmul_f32(v222, v252); + float32x2_t v261 = vmul_f32(v224, v260); + float32x2_t v125 = vadd_f32(v121, v123); + float32x2_t v126 = vsub_f32(v121, v123); + float32x2_t v127 = vadd_f32(v122, v124); + float32x2_t v146 = vrev64_f32(v122); + float32x2_t v160 = vrev64_f32(v124); + float32x2_t v178 = vadd_f32(v175, v28); + float32x2_t v186 = vmul_f32(v175, v185); + float32x2_t v190 = vmul_f32(v176, v189); + float32x2_t v197 = vmul_f32(v196, v195); + float32x2_t v203 = vrev64_f32(v177); + float32x2_t v211 = vmul_f32(v210, v209); + float32x2_t v228 = vadd_f32(v225, v29); + float32x2_t v241 = vrev64_f32(v225); + float32x2_t v248 = vrev64_f32(v226); + float32x2_t v257 = vmul_f32(v227, v256); + float32x2_t v128 = vadd_f32(v125, v36); + float32x2_t v136 = vmul_f32(v125, v135); + float32x2_t v140 = vmul_f32(v126, v139); + float32x2_t v147 = vmul_f32(v146, v145); + float32x2_t v153 = vrev64_f32(v127); + float32x2_t v161 = vmul_f32(v160, v159); + float32x2_t v182 = vmul_f32(v178, v181); + float32x2_t v204 = vmul_f32(v203, v202); + float32x2_t v234 = vrev64_f32(v228); + float32x2_t v242 = vmul_f32(v241, v240); + float32x2_t v249 = vmul_f32(v248, v247); + float32x2_t v265 = vsub_f32(v253, v257); + float32x2_t v266 = vadd_f32(v257, v261); + float32x2_t v154 = vmul_f32(v153, v152); + float32x2_t v162 = vadd_f32(v128, v136); + float32x2_t v212 = vadd_f32(v182, v186); + float32x2_t v215 = vsub_f32(v197, v204); + float32x2_t v216 = vadd_f32(v204, v211); + float32x2_t v235 = vmul_f32(v234, v233); + float32x2_t v271 = vadd_f32(v128, v182); + int16x4_t v276 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v128, 15), (int32x2_t){0, 0})); + float32x2_t v163 = vadd_f32(v162, v140); + float32x2_t v164 = vsub_f32(v162, v140); + float32x2_t v165 = vsub_f32(v147, v154); + float32x2_t v166 = vadd_f32(v154, v161); + float32x2_t v213 = vadd_f32(v212, v190); + float32x2_t v214 = vsub_f32(v212, v190); + float32x2_t v262 = vadd_f32(v235, v242); + float32x2_t v272 = vadd_f32(v271, v235); + float32x2_t v273 = vsub_f32(v271, v235); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v276), 0); + float32x2_t v167 = vadd_f32(v163, v165); + float32x2_t v168 = vsub_f32(v163, v165); + float32x2_t v169 = vadd_f32(v164, v166); + float32x2_t v170 = vsub_f32(v164, v166); + float32x2_t v217 = vadd_f32(v213, v215); + float32x2_t v218 = vsub_f32(v213, v215); + float32x2_t v219 = vadd_f32(v214, v216); + float32x2_t v220 = vsub_f32(v214, v216); + float32x2_t v263 = vadd_f32(v262, v249); + float32x2_t v264 = vsub_f32(v262, v249); + int16x4_t v282 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v273, 15), (int32x2_t){0, 0})); + int16x4_t v288 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v272, 15), (int32x2_t){0, 0})); + float32x2_t v267 = vadd_f32(v263, v265); + float32x2_t v268 = vsub_f32(v263, v265); + float32x2_t v269 = vadd_f32(v264, v266); + float32x2_t v270 = 
vsub_f32(v264, v266); + v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v282), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v288), 0); + float32x2_t v292 = vadd_f32(v168, v218); + int16x4_t v297 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v168, 15), (int32x2_t){0, 0})); + float32x2_t v313 = vadd_f32(v170, v220); + int16x4_t v318 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v170, 15), (int32x2_t){0, 0})); + float32x2_t v334 = vadd_f32(v169, v219); + int16x4_t v339 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v169, 15), (int32x2_t){0, 0})); + float32x2_t v355 = vadd_f32(v167, v217); + int16x4_t v360 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v167, 15), (int32x2_t){0, 0})); + float32x2_t v293 = vadd_f32(v292, v268); + float32x2_t v294 = vsub_f32(v292, v268); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v297), 0); + float32x2_t v314 = vadd_f32(v313, v270); + float32x2_t v315 = vsub_f32(v313, v270); + v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v318), 0); + float32x2_t v335 = vadd_f32(v334, v269); + float32x2_t v336 = vsub_f32(v334, v269); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v339), 0); + float32x2_t v356 = vadd_f32(v355, v267); + float32x2_t v357 = vsub_f32(v355, v267); + v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v360), 0); + int16x4_t v303 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v294, 15), (int32x2_t){0, 0})); + int16x4_t v309 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v293, 15), (int32x2_t){0, 0})); + int16x4_t v324 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v315, 15), (int32x2_t){0, 0})); + int16x4_t v330 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v314, 15), (int32x2_t){0, 0})); + int16x4_t v345 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v336, 15), (int32x2_t){0, 0})); + int16x4_t v351 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v335, 15), (int32x2_t){0, 0})); + int16x4_t v366 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v357, 15), (int32x2_t){0, 0})); + int16x4_t v372 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v356, 15), (int32x2_t){0, 0})); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v303), 0); + v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v309), 0); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v324), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v330), 0); + v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v345), 0); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v351), 0); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v366), 0); + v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v372), 0); + v5 += 1 * 1; + v6 += 1 * 1; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uu15(const armral_cmplx_int16_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * 1; + int64_t v12 = v10 * 1; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + float v167 = -1.2500000000000000e+00F; + float v172 = 5.5901699437494745e-01F; + float v177 = -1.5388417685876268e+00F; + float v184 = -5.8778525229247325e-01F; + float v191 = -3.6327126400268028e-01F; + float v215 = -1.4999999999999998e+00F; + float v220 = 1.8749999999999998e+00F; + float v225 = -8.3852549156242107e-01F; + float v230 = 2.3082626528814396e+00F; + float v237 = 8.8167787843870971e-01F; + 
float v244 = 5.4490689600402031e-01F; + float v268 = -8.6602540378443871e-01F; + float v275 = 1.0825317547305484e+00F; + float v282 = -4.8412291827592718e-01F; + float v289 = -1.3326760640014592e+00F; + float v294 = -5.0903696045512736e-01F; + float v299 = -3.1460214309120460e-01F; + const int32_t *v517 = &v5[v0]; + int32_t *v644 = &v6[v2]; + int64_t v19 = v0 * 5; + int64_t v27 = v0 * 10; + int64_t v46 = v0 * 8; + int64_t v54 = v0 * 13; + int64_t v64 = v0 * 3; + int64_t v73 = v0 * 11; + int64_t v91 = v0 * 6; + int64_t v100 = v0 * 14; + int64_t v108 = v0 * 4; + int64_t v118 = v0 * 9; + int64_t v127 = v0 * 2; + int64_t v135 = v0 * 7; + int64_t v145 = v0 * 12; + float v180 = v4 * v177; + float v187 = v4 * v184; + float v194 = v4 * v191; + float v233 = v4 * v230; + float v240 = v4 * v237; + float v247 = v4 * v244; + float v271 = v4 * v268; + float v278 = v4 * v275; + float v285 = v4 * v282; + int64_t v324 = v2 * 10; + int64_t v332 = v2 * 5; + int64_t v343 = v2 * 6; + int64_t v359 = v2 * 11; + int64_t v370 = v2 * 12; + int64_t v378 = v2 * 7; + int64_t v386 = v2 * 2; + int64_t v397 = v2 * 3; + int64_t v405 = v2 * 13; + int64_t v413 = v2 * 8; + int64_t v424 = v2 * 9; + int64_t v432 = v2 * 4; + int64_t v440 = v2 * 14; + const int32_t *v472 = &v5[0]; + svfloat32_t v584 = svdup_n_f32(v167); + svfloat32_t v585 = svdup_n_f32(v172); + svfloat32_t v589 = svdup_n_f32(v215); + svfloat32_t v590 = svdup_n_f32(v220); + svfloat32_t v591 = svdup_n_f32(v225); + svfloat32_t v598 = svdup_n_f32(v289); + svfloat32_t v599 = svdup_n_f32(v294); + svfloat32_t v600 = svdup_n_f32(v299); + int32_t *v608 = &v6[0]; + svfloat32_t v87 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v517[0])), + 1.F / (1ULL << 15ULL)); + const int32_t *v453 = &v5[v19]; + const int32_t *v462 = &v5[v27]; + const int32_t *v481 = &v5[v46]; + const int32_t *v490 = &v5[v54]; + const int32_t *v499 = &v5[v64]; + const int32_t *v508 = &v5[v73]; + const int32_t *v526 = &v5[v91]; + const int32_t *v535 = &v5[v100]; + const int32_t *v544 = &v5[v108]; + const int32_t *v553 = &v5[v118]; + const int32_t *v562 = &v5[v127]; + const int32_t *v571 = &v5[v135]; + const int32_t *v580 = &v5[v145]; + svfloat32_t v586 = svdup_n_f32(v180); + svfloat32_t v587 = svdup_n_f32(v187); + svfloat32_t v588 = svdup_n_f32(v194); + svfloat32_t v592 = svdup_n_f32(v233); + svfloat32_t v593 = svdup_n_f32(v240); + svfloat32_t v594 = svdup_n_f32(v247); + svfloat32_t v595 = svdup_n_f32(v271); + svfloat32_t v596 = svdup_n_f32(v278); + svfloat32_t v597 = svdup_n_f32(v285); + int32_t *v617 = &v6[v324]; + int32_t *v626 = &v6[v332]; + int32_t *v635 = &v6[v343]; + int32_t *v653 = &v6[v359]; + int32_t *v662 = &v6[v370]; + int32_t *v671 = &v6[v378]; + int32_t *v680 = &v6[v386]; + int32_t *v689 = &v6[v397]; + int32_t *v698 = &v6[v405]; + int32_t *v707 = &v6[v413]; + int32_t *v716 = &v6[v424]; + int32_t *v725 = &v6[v432]; + int32_t *v734 = &v6[v440]; + svfloat32_t v43 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v472[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v25 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v453[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v33 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v462[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v52 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v481[0])), + 
1.F / (1ULL << 15ULL)); + svfloat32_t v60 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v490[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v70 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v499[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v79 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v508[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v97 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v526[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v106 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v535[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v114 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v544[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v124 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v553[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v133 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v562[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v141 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v571[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v151 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v580[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v34 = svadd_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v35 = svsub_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v61 = svadd_f32_x(svptrue_b32(), v52, v60); + svfloat32_t v62 = svsub_f32_x(svptrue_b32(), v52, v60); + svfloat32_t v88 = svadd_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v89 = svsub_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v115 = svadd_f32_x(svptrue_b32(), v106, v114); + svfloat32_t v116 = svsub_f32_x(svptrue_b32(), v106, v114); + svfloat32_t v142 = svadd_f32_x(svptrue_b32(), v133, v141); + svfloat32_t v143 = svsub_f32_x(svptrue_b32(), v133, v141); + svfloat32_t v44 = svadd_f32_x(svptrue_b32(), v34, v43); + svfloat32_t v71 = svadd_f32_x(svptrue_b32(), v61, v70); + svfloat32_t v98 = svadd_f32_x(svptrue_b32(), v88, v97); + svfloat32_t v125 = svadd_f32_x(svptrue_b32(), v115, v124); + svfloat32_t v152 = svadd_f32_x(svptrue_b32(), v142, v151); + svfloat32_t v206 = svadd_f32_x(svptrue_b32(), v61, v142); + svfloat32_t v207 = svsub_f32_x(svptrue_b32(), v61, v142); + svfloat32_t v208 = svadd_f32_x(svptrue_b32(), v115, v88); + svfloat32_t v209 = svsub_f32_x(svptrue_b32(), v115, v88); + svfloat32_t v259 = svadd_f32_x(svptrue_b32(), v62, v143); + svfloat32_t v260 = svsub_f32_x(svptrue_b32(), v62, v143); + svfloat32_t v261 = svadd_f32_x(svptrue_b32(), v116, v89); + svfloat32_t v262 = svsub_f32_x(svptrue_b32(), v116, v89); + svfloat32_t v153 = svadd_f32_x(svptrue_b32(), v71, v152); + svfloat32_t v154 = svsub_f32_x(svptrue_b32(), v71, v152); + svfloat32_t v155 = svadd_f32_x(svptrue_b32(), v125, v98); + svfloat32_t v156 = svsub_f32_x(svptrue_b32(), v125, v98); + svfloat32_t v210 = svadd_f32_x(svptrue_b32(), v206, v208); + svfloat32_t v211 = svsub_f32_x(svptrue_b32(), v206, v208); + svfloat32_t v212 = svadd_f32_x(svptrue_b32(), v207, v209); + svfloat32_t zero235 = svdup_n_f32(0); + svfloat32_t v235 = svcmla_f32_x(pred_full, zero235, v592, v207, 90); + svfloat32_t v263 = svadd_f32_x(svptrue_b32(), 
v259, v261); + svfloat32_t v264 = svsub_f32_x(svptrue_b32(), v259, v261); + svfloat32_t v265 = svadd_f32_x(svptrue_b32(), v260, v262); + svfloat32_t v302 = svmul_f32_x(svptrue_b32(), v262, v600); + svfloat32_t v157 = svadd_f32_x(svptrue_b32(), v153, v155); + svfloat32_t v158 = svsub_f32_x(svptrue_b32(), v153, v155); + svfloat32_t v159 = svadd_f32_x(svptrue_b32(), v154, v156); + svfloat32_t zero182 = svdup_n_f32(0); + svfloat32_t v182 = svcmla_f32_x(pred_full, zero182, v586, v154, 90); + svfloat32_t v213 = svadd_f32_x(svptrue_b32(), v210, v34); + svfloat32_t v223 = svmul_f32_x(svptrue_b32(), v210, v590); + svfloat32_t zero242 = svdup_n_f32(0); + svfloat32_t v242 = svcmla_f32_x(pred_full, zero242, v593, v212, 90); + svfloat32_t v266 = svadd_f32_x(svptrue_b32(), v263, v35); + svfloat32_t zero287 = svdup_n_f32(0); + svfloat32_t v287 = svcmla_f32_x(pred_full, zero287, v597, v264, 90); + svfloat32_t v297 = svmul_f32_x(svptrue_b32(), v265, v599); + svfloat32_t v160 = svadd_f32_x(svptrue_b32(), v157, v44); + svfloat32_t zero189 = svdup_n_f32(0); + svfloat32_t v189 = svcmla_f32_x(pred_full, zero189, v587, v159, 90); + svfloat32_t v253 = svsub_f32_x(svptrue_b32(), v235, v242); + svfloat32_t v254 = svcmla_f32_x(pred_full, v242, v594, v209, 90); + svfloat32_t zero273 = svdup_n_f32(0); + svfloat32_t v273 = svcmla_f32_x(pred_full, zero273, v595, v266, 90); + svfloat32_t v306 = svnmls_f32_x(pred_full, v297, v260, v598); + svfloat32_t v307 = svmla_f32_x(pred_full, v302, v265, v599); + svfloat32_t v197 = svmla_f32_x(pred_full, v160, v157, v584); + svfloat32_t v200 = svsub_f32_x(svptrue_b32(), v182, v189); + svfloat32_t v201 = svcmla_f32_x(pred_full, v189, v588, v156, 90); + svfloat32_t v250 = svmla_f32_x(pred_full, v223, v213, v589); + svfloat32_t v303 = svcmla_f32_x(pred_full, v273, v596, v263, 90); + svfloat32_t v312 = svmla_f32_x(pred_full, v160, v213, v589); + svint16_t v317 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v160, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v198 = svmla_f32_x(pred_full, v197, v158, v585); + svfloat32_t v199 = svmls_f32_x(pred_full, v197, v158, v585); + svfloat32_t v251 = svmla_f32_x(pred_full, v250, v211, v591); + svfloat32_t v252 = svmls_f32_x(pred_full, v250, v211, v591); + svfloat32_t v304 = svadd_f32_x(svptrue_b32(), v303, v287); + svfloat32_t v305 = svsub_f32_x(svptrue_b32(), v303, v287); + svfloat32_t v313 = svadd_f32_x(svptrue_b32(), v312, v273); + svfloat32_t v314 = svsub_f32_x(svptrue_b32(), v312, v273); + svst1w_u64(pred_full, (unsigned *)(v608), svreinterpret_u64_s16(v317)); + svfloat32_t v202 = svadd_f32_x(svptrue_b32(), v198, v200); + svfloat32_t v203 = svsub_f32_x(svptrue_b32(), v198, v200); + svfloat32_t v204 = svadd_f32_x(svptrue_b32(), v199, v201); + svfloat32_t v205 = svsub_f32_x(svptrue_b32(), v199, v201); + svfloat32_t v255 = svadd_f32_x(svptrue_b32(), v251, v253); + svfloat32_t v256 = svsub_f32_x(svptrue_b32(), v251, v253); + svfloat32_t v257 = svadd_f32_x(svptrue_b32(), v252, v254); + svfloat32_t v258 = svsub_f32_x(svptrue_b32(), v252, v254); + svfloat32_t v308 = svadd_f32_x(svptrue_b32(), v304, v306); + svfloat32_t v309 = svsub_f32_x(svptrue_b32(), v304, v306); + svfloat32_t v310 = svadd_f32_x(svptrue_b32(), v305, v307); + svfloat32_t v311 = svsub_f32_x(svptrue_b32(), v305, v307); + svint16_t v325 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v314, (float)(1ULL << 31ULL)))), + 
svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v333 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v313, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v339 = svadd_f32_x(svptrue_b32(), v203, v256); + svint16_t v344 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v203, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v366 = svadd_f32_x(svptrue_b32(), v205, v258); + svint16_t v371 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v205, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v393 = svadd_f32_x(svptrue_b32(), v204, v257); + svint16_t v398 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v204, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v420 = svadd_f32_x(svptrue_b32(), v202, v255); + svint16_t v425 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v202, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v617), svreinterpret_u64_s16(v325)); + svst1w_u64(pred_full, (unsigned *)(v626), svreinterpret_u64_s16(v333)); + svfloat32_t v340 = svadd_f32_x(svptrue_b32(), v339, v309); + svfloat32_t v341 = svsub_f32_x(svptrue_b32(), v339, v309); + svfloat32_t v367 = svadd_f32_x(svptrue_b32(), v366, v311); + svfloat32_t v368 = svsub_f32_x(svptrue_b32(), v366, v311); + svfloat32_t v394 = svadd_f32_x(svptrue_b32(), v393, v310); + svfloat32_t v395 = svsub_f32_x(svptrue_b32(), v393, v310); + svfloat32_t v421 = svadd_f32_x(svptrue_b32(), v420, v308); + svfloat32_t v422 = svsub_f32_x(svptrue_b32(), v420, v308); + svst1w_u64(pred_full, (unsigned *)(v635), svreinterpret_u64_s16(v344)); + svst1w_u64(pred_full, (unsigned *)(v662), svreinterpret_u64_s16(v371)); + svst1w_u64(pred_full, (unsigned *)(v689), svreinterpret_u64_s16(v398)); + svst1w_u64(pred_full, (unsigned *)(v716), svreinterpret_u64_s16(v425)); + svint16_t v352 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v341, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v360 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v340, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v379 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v368, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v387 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v367, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v406 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v395, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 
0x0000000000040004ULL))); + svint16_t v414 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v394, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v433 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v422, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v441 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v421, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v644), svreinterpret_u64_s16(v352)); + svst1w_u64(pred_full, (unsigned *)(v653), svreinterpret_u64_s16(v360)); + svst1w_u64(pred_full, (unsigned *)(v671), svreinterpret_u64_s16(v379)); + svst1w_u64(pred_full, (unsigned *)(v680), svreinterpret_u64_s16(v387)); + svst1w_u64(pred_full, (unsigned *)(v698), svreinterpret_u64_s16(v406)); + svst1w_u64(pred_full, (unsigned *)(v707), svreinterpret_u64_s16(v414)); + svst1w_u64(pred_full, (unsigned *)(v725), svreinterpret_u64_s16(v433)); + svst1w_u64(pred_full, (unsigned *)(v734), svreinterpret_u64_s16(v441)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uu16(const armral_cmplx_int16_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + for (int j = 0; j < howmany; j += 1) { + int16x4_t v76 = vld1s_s16(&v5[istride]); + float v197 = 1.0000000000000000e+00F; + float v198 = -1.0000000000000000e+00F; + float v205 = -7.0710678118654746e-01F; + float v212 = 7.0710678118654757e-01F; + float v215 = 9.2387953251128674e-01F; + float v216 = -9.2387953251128674e-01F; + float v223 = 5.4119610014619690e-01F; + float v230 = -1.3065629648763766e+00F; + float32x2_t v232 = (float32x2_t){v4, v4}; + float v237 = 3.8268343236508984e-01F; + float v241 = 1.3065629648763766e+00F; + float v245 = -5.4119610014619690e-01F; + int16x4_t v20 = vld1s_s16(&v5[0]); + float32x2_t v77 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v76)), 15); + float32x2_t v199 = (float32x2_t){v197, v198}; + float32x2_t v206 = (float32x2_t){v212, v205}; + float32x2_t v213 = (float32x2_t){v212, v212}; + float32x2_t v217 = (float32x2_t){v215, v216}; + float32x2_t v224 = (float32x2_t){v245, v223}; + float32x2_t v231 = (float32x2_t){v241, v230}; + float32x2_t v238 = (float32x2_t){v237, v237}; + float32x2_t v242 = (float32x2_t){v241, v241}; + float32x2_t v246 = (float32x2_t){v245, v245}; + float32x2_t v21 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v20)), 15); + int16x4_t v26 = vld1s_s16(&v5[istride * 8]); + int16x4_t v34 = vld1s_s16(&v5[istride * 4]); + int16x4_t v40 = vld1s_s16(&v5[istride * 12]); + int16x4_t v48 = vld1s_s16(&v5[istride * 2]); + int16x4_t v54 = vld1s_s16(&v5[istride * 10]); + int16x4_t v62 = vld1s_s16(&v5[istride * 6]); + int16x4_t v68 = vld1s_s16(&v5[istride * 14]); + int16x4_t v82 = vld1s_s16(&v5[istride * 9]); + int16x4_t v90 = vld1s_s16(&v5[istride * 5]); + int16x4_t v96 = vld1s_s16(&v5[istride * 13]); + int16x4_t v104 = vld1s_s16(&v5[istride * 3]); + int16x4_t v110 = vld1s_s16(&v5[istride * 11]); + int16x4_t v118 = vld1s_s16(&v5[istride * 7]); + int16x4_t v124 = vld1s_s16(&v5[istride * 15]); + float32x2_t v201 = vmul_f32(v232, v199); + float32x2_t v208 = 
vmul_f32(v232, v206); + float32x2_t v219 = vmul_f32(v232, v217); + float32x2_t v226 = vmul_f32(v232, v224); + float32x2_t v233 = vmul_f32(v232, v231); + float32x2_t v27 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v26)), 15); + float32x2_t v35 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v34)), 15); + float32x2_t v41 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v40)), 15); + float32x2_t v49 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v48)), 15); + float32x2_t v55 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v54)), 15); + float32x2_t v63 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v62)), 15); + float32x2_t v69 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v68)), 15); + float32x2_t v83 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v82)), 15); + float32x2_t v91 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v90)), 15); + float32x2_t v97 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v96)), 15); + float32x2_t v105 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v104)), 15); + float32x2_t v111 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v110)), 15); + float32x2_t v119 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v118)), 15); + float32x2_t v125 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v124)), 15); + float32x2_t v28 = vadd_f32(v21, v27); + float32x2_t v29 = vsub_f32(v21, v27); + float32x2_t v42 = vadd_f32(v35, v41); + float32x2_t v43 = vsub_f32(v35, v41); + float32x2_t v56 = vadd_f32(v49, v55); + float32x2_t v57 = vsub_f32(v49, v55); + float32x2_t v70 = vadd_f32(v63, v69); + float32x2_t v71 = vsub_f32(v63, v69); + float32x2_t v84 = vadd_f32(v77, v83); + float32x2_t v85 = vsub_f32(v77, v83); + float32x2_t v98 = vadd_f32(v91, v97); + float32x2_t v99 = vsub_f32(v91, v97); + float32x2_t v112 = vadd_f32(v105, v111); + float32x2_t v113 = vsub_f32(v105, v111); + float32x2_t v126 = vadd_f32(v119, v125); + float32x2_t v127 = vsub_f32(v119, v125); + float32x2_t v128 = vadd_f32(v28, v42); + float32x2_t v129 = vsub_f32(v28, v42); + float32x2_t v130 = vadd_f32(v56, v70); + float32x2_t v131 = vsub_f32(v56, v70); + float32x2_t v132 = vadd_f32(v84, v98); + float32x2_t v133 = vsub_f32(v84, v98); + float32x2_t v134 = vadd_f32(v112, v126); + float32x2_t v135 = vsub_f32(v112, v126); + float32x2_t v144 = vadd_f32(v57, v71); + float32x2_t v145 = vsub_f32(v57, v71); + float32x2_t v146 = vadd_f32(v85, v127); + float32x2_t v147 = vsub_f32(v85, v127); + float32x2_t v148 = vadd_f32(v99, v113); + float32x2_t v149 = vsub_f32(v99, v113); + float32x2_t v202 = vrev64_f32(v43); + float32x2_t v136 = vadd_f32(v128, v130); + float32x2_t v137 = vsub_f32(v128, v130); + float32x2_t v138 = vadd_f32(v132, v134); + float32x2_t v139 = vsub_f32(v132, v134); + float32x2_t v142 = vadd_f32(v133, v135); + float32x2_t v143 = vsub_f32(v133, v135); + float32x2_t v150 = vadd_f32(v146, v148); + float32x2_t v151 = vadd_f32(v147, v149); + float32x2_t v180 = vrev64_f32(v131); + float32x2_t v203 = vmul_f32(v202, v201); + float32x2_t v209 = vrev64_f32(v144); + float32x2_t v214 = vmul_f32(v145, v213); + float32x2_t v227 = vrev64_f32(v146); + float32x2_t v234 = vrev64_f32(v148); + float32x2_t v243 = vmul_f32(v147, v242); + float32x2_t v247 = vmul_f32(v149, v246); + float32x2_t v140 = vadd_f32(v136, v138); + float32x2_t v141 = vsub_f32(v136, v138); + float32x2_t v169 = vrev64_f32(v139); + float32x2_t v181 = vmul_f32(v180, v201); + float32x2_t v187 = vrev64_f32(v142); + float32x2_t v192 = vmul_f32(v143, v213); + float32x2_t v210 = vmul_f32(v209, v208); + float32x2_t v220 = vrev64_f32(v150); + float32x2_t v228 = vmul_f32(v227, v226); + float32x2_t v235 = vmul_f32(v234, v233); + float32x2_t v239 = vmul_f32(v151, v238); + 
float32x2_t v258 = vadd_f32(v29, v214); + float32x2_t v259 = vsub_f32(v29, v214); + float32x2_t v170 = vmul_f32(v169, v201); + float32x2_t v188 = vmul_f32(v187, v208); + float32x2_t v221 = vmul_f32(v220, v219); + float32x2_t v250 = vadd_f32(v129, v192); + float32x2_t v252 = vsub_f32(v129, v192); + float32x2_t v260 = vadd_f32(v203, v210); + float32x2_t v261 = vsub_f32(v203, v210); + float32x2_t v264 = vsub_f32(v243, v239); + float32x2_t v265 = vsub_f32(v247, v239); + float32x2_t v266 = vsub_f32(v239, v243); + float32x2_t v267 = vsub_f32(v239, v247); + int16x4_t v294 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v140, 15), (int32x2_t){0, 0})); + int16x4_t v342 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v141, 15), (int32x2_t){0, 0})); + float32x2_t v248 = vadd_f32(v137, v170); + float32x2_t v249 = vsub_f32(v137, v170); + float32x2_t v251 = vadd_f32(v181, v188); + float32x2_t v253 = vsub_f32(v188, v181); + float32x2_t v262 = vadd_f32(v221, v228); + float32x2_t v263 = vsub_f32(v221, v235); + float32x2_t v268 = vadd_f32(v258, v264); + float32x2_t v269 = vsub_f32(v258, v264); + float32x2_t v270 = vadd_f32(v258, v266); + float32x2_t v271 = vsub_f32(v258, v266); + float32x2_t v272 = vadd_f32(v259, v261); + float32x2_t v273 = vsub_f32(v259, v261); + float32x2_t v274 = vadd_f32(v259, v267); + float32x2_t v275 = vsub_f32(v259, v267); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v294), 0); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v342), 0); + float32x2_t v254 = vadd_f32(v250, v251); + float32x2_t v255 = vadd_f32(v252, v253); + float32x2_t v256 = vsub_f32(v252, v253); + float32x2_t v257 = vsub_f32(v250, v251); + float32x2_t v278 = vadd_f32(v262, v260); + float32x2_t v279 = vsub_f32(v262, v260); + float32x2_t v280 = vadd_f32(v263, v265); + float32x2_t v281 = vsub_f32(v263, v265); + float32x2_t v282 = vadd_f32(v263, v261); + float32x2_t v283 = vsub_f32(v263, v261); + int16x4_t v318 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v249, 15), (int32x2_t){0, 0})); + int16x4_t v366 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v248, 15), (int32x2_t){0, 0})); + float32x2_t v284 = vadd_f32(v268, v278); + float32x2_t v285 = vadd_f32(v269, v279); + float32x2_t v286 = vsub_f32(v270, v279); + float32x2_t v287 = vsub_f32(v271, v278); + float32x2_t v288 = vadd_f32(v272, v280); + float32x2_t v289 = vadd_f32(v273, v281); + float32x2_t v290 = vsub_f32(v274, v283); + float32x2_t v291 = vsub_f32(v275, v282); + int16x4_t v306 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v257, 15), (int32x2_t){0, 0})); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v318), 0); + int16x4_t v330 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v256, 15), (int32x2_t){0, 0})); + int16x4_t v354 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v255, 15), (int32x2_t){0, 0})); + v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v366), 0); + int16x4_t v378 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v254, 15), (int32x2_t){0, 0})); + int16x4_t v300 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v287, 15), (int32x2_t){0, 0})); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v306), 0); + int16x4_t v312 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v290, 15), (int32x2_t){0, 0})); + int16x4_t v324 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v291, 15), (int32x2_t){0, 0})); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v330), 0); + int16x4_t v336 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v286, 15), (int32x2_t){0, 0})); + int16x4_t v348 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v285, 15), (int32x2_t){0, 0})); + v6[ostride * 10] = 
vget_lane_s32(vreinterpret_s32_s16(v354), 0); + int16x4_t v360 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v288, 15), (int32x2_t){0, 0})); + int16x4_t v372 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v289, 15), (int32x2_t){0, 0})); + v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v378), 0); + int16x4_t v384 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v284, 15), (int32x2_t){0, 0})); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v300), 0); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v312), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v324), 0); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v336), 0); + v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v348), 0); + v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v360), 0); + v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v372), 0); + v6[ostride * 15] = vget_lane_s32(vreinterpret_s32_s16(v384), 0); + v5 += 1 * 1; + v6 += 1 * 1; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uu16(const armral_cmplx_int16_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * 1; + int64_t v12 = v10 * 1; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + float v238 = -1.0000000000000000e+00F; + float v245 = -7.0710678118654746e-01F; + float v252 = 7.0710678118654757e-01F; + float v257 = -9.2387953251128674e-01F; + float v264 = 5.4119610014619690e-01F; + float v271 = -1.3065629648763766e+00F; + float v278 = 3.8268343236508984e-01F; + float v283 = 1.3065629648763766e+00F; + float v288 = -5.4119610014619690e-01F; + const int32_t *v543 = &v5[v0]; + int32_t *v643 = &v6[v2]; + int64_t v27 = v0 * 8; + int64_t v37 = v0 * 4; + int64_t v45 = v0 * 12; + int64_t v55 = v0 * 2; + int64_t v63 = v0 * 10; + int64_t v73 = v0 * 6; + int64_t v81 = v0 * 14; + int64_t v99 = v0 * 9; + int64_t v109 = v0 * 5; + int64_t v117 = v0 * 13; + int64_t v127 = v0 * 3; + int64_t v135 = v0 * 11; + int64_t v145 = v0 * 7; + int64_t v153 = v0 * 15; + float v241 = v4 * v238; + float v248 = v4 * v245; + float v260 = v4 * v257; + float v267 = v4 * v264; + float v274 = v4 * v271; + int64_t v353 = v2 * 2; + int64_t v361 = v2 * 3; + int64_t v369 = v2 * 4; + int64_t v377 = v2 * 5; + int64_t v385 = v2 * 6; + int64_t v393 = v2 * 7; + int64_t v401 = v2 * 8; + int64_t v409 = v2 * 9; + int64_t v417 = v2 * 10; + int64_t v425 = v2 * 11; + int64_t v433 = v2 * 12; + int64_t v441 = v2 * 13; + int64_t v449 = v2 * 14; + int64_t v457 = v2 * 15; + const int32_t *v471 = &v5[0]; + svfloat32_t v620 = svdup_n_f32(v252); + svfloat32_t v624 = svdup_n_f32(v278); + svfloat32_t v625 = svdup_n_f32(v283); + svfloat32_t v626 = svdup_n_f32(v288); + int32_t *v634 = &v6[0]; + svfloat32_t v97 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v543[0])), + 1.F / (1ULL << 15ULL)); + const int32_t *v480 = &v5[v27]; + const int32_t *v489 = &v5[v37]; + const int32_t *v498 = &v5[v45]; + const int32_t *v507 = &v5[v55]; + const int32_t *v516 = &v5[v63]; + const int32_t *v525 = &v5[v73]; + const int32_t *v534 = &v5[v81]; + const int32_t *v552 = &v5[v99]; + const int32_t *v561 = &v5[v109]; + const int32_t *v570 = &v5[v117]; + const int32_t *v579 = &v5[v127]; + const int32_t *v588 = &v5[v135]; + const 
int32_t *v597 = &v5[v145]; + const int32_t *v606 = &v5[v153]; + svfloat32_t v618 = svdup_n_f32(v241); + svfloat32_t v619 = svdup_n_f32(v248); + svfloat32_t v621 = svdup_n_f32(v260); + svfloat32_t v622 = svdup_n_f32(v267); + svfloat32_t v623 = svdup_n_f32(v274); + int32_t *v652 = &v6[v353]; + int32_t *v661 = &v6[v361]; + int32_t *v670 = &v6[v369]; + int32_t *v679 = &v6[v377]; + int32_t *v688 = &v6[v385]; + int32_t *v697 = &v6[v393]; + int32_t *v706 = &v6[v401]; + int32_t *v715 = &v6[v409]; + int32_t *v724 = &v6[v417]; + int32_t *v733 = &v6[v425]; + int32_t *v742 = &v6[v433]; + int32_t *v751 = &v6[v441]; + int32_t *v760 = &v6[v449]; + int32_t *v769 = &v6[v457]; + svfloat32_t v25 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v471[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v33 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v480[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v43 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v489[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v51 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v498[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v61 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v507[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v69 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v516[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v79 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v525[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v87 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v534[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v105 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v552[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v115 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v561[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v123 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v570[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v133 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v579[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v141 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v588[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v151 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v597[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v159 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v606[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v34 = svadd_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v35 = svsub_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v52 = svadd_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v53 = svsub_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v70 = svadd_f32_x(svptrue_b32(), v61, v69); + svfloat32_t v71 = svsub_f32_x(svptrue_b32(), v61, v69); + svfloat32_t v88 = svadd_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v89 = svsub_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v106 = svadd_f32_x(svptrue_b32(), v97, v105); 
+ svfloat32_t v107 = svsub_f32_x(svptrue_b32(), v97, v105); + svfloat32_t v124 = svadd_f32_x(svptrue_b32(), v115, v123); + svfloat32_t v125 = svsub_f32_x(svptrue_b32(), v115, v123); + svfloat32_t v142 = svadd_f32_x(svptrue_b32(), v133, v141); + svfloat32_t v143 = svsub_f32_x(svptrue_b32(), v133, v141); + svfloat32_t v160 = svadd_f32_x(svptrue_b32(), v151, v159); + svfloat32_t v161 = svsub_f32_x(svptrue_b32(), v151, v159); + svfloat32_t v162 = svadd_f32_x(svptrue_b32(), v34, v52); + svfloat32_t v163 = svsub_f32_x(svptrue_b32(), v34, v52); + svfloat32_t v164 = svadd_f32_x(svptrue_b32(), v70, v88); + svfloat32_t v165 = svsub_f32_x(svptrue_b32(), v70, v88); + svfloat32_t v166 = svadd_f32_x(svptrue_b32(), v106, v124); + svfloat32_t v167 = svsub_f32_x(svptrue_b32(), v106, v124); + svfloat32_t v168 = svadd_f32_x(svptrue_b32(), v142, v160); + svfloat32_t v169 = svsub_f32_x(svptrue_b32(), v142, v160); + svfloat32_t v178 = svadd_f32_x(svptrue_b32(), v71, v89); + svfloat32_t v179 = svsub_f32_x(svptrue_b32(), v71, v89); + svfloat32_t v180 = svadd_f32_x(svptrue_b32(), v107, v161); + svfloat32_t v181 = svsub_f32_x(svptrue_b32(), v107, v161); + svfloat32_t v182 = svadd_f32_x(svptrue_b32(), v125, v143); + svfloat32_t v183 = svsub_f32_x(svptrue_b32(), v125, v143); + svfloat32_t zero243 = svdup_n_f32(0); + svfloat32_t v243 = svcmla_f32_x(pred_full, zero243, v618, v53, 90); + svfloat32_t v170 = svadd_f32_x(svptrue_b32(), v162, v164); + svfloat32_t v171 = svsub_f32_x(svptrue_b32(), v162, v164); + svfloat32_t v172 = svadd_f32_x(svptrue_b32(), v166, v168); + svfloat32_t v173 = svsub_f32_x(svptrue_b32(), v166, v168); + svfloat32_t v176 = svadd_f32_x(svptrue_b32(), v167, v169); + svfloat32_t v177 = svsub_f32_x(svptrue_b32(), v167, v169); + svfloat32_t v184 = svadd_f32_x(svptrue_b32(), v180, v182); + svfloat32_t v185 = svadd_f32_x(svptrue_b32(), v181, v183); + svfloat32_t zero219 = svdup_n_f32(0); + svfloat32_t v219 = svcmla_f32_x(pred_full, zero219, v618, v165, 90); + svfloat32_t zero250 = svdup_n_f32(0); + svfloat32_t v250 = svcmla_f32_x(pred_full, zero250, v619, v178, 90); + svfloat32_t zero276 = svdup_n_f32(0); + svfloat32_t v276 = svcmla_f32_x(pred_full, zero276, v623, v182, 90); + svfloat32_t v286 = svmul_f32_x(svptrue_b32(), v181, v625); + svfloat32_t v291 = svmul_f32_x(svptrue_b32(), v183, v626); + svfloat32_t v174 = svadd_f32_x(svptrue_b32(), v170, v172); + svfloat32_t v175 = svsub_f32_x(svptrue_b32(), v170, v172); + svfloat32_t zero207 = svdup_n_f32(0); + svfloat32_t v207 = svcmla_f32_x(pred_full, zero207, v618, v173, 90); + svfloat32_t zero226 = svdup_n_f32(0); + svfloat32_t v226 = svcmla_f32_x(pred_full, zero226, v619, v176, 90); + svfloat32_t zero262 = svdup_n_f32(0); + svfloat32_t v262 = svcmla_f32_x(pred_full, zero262, v621, v184, 90); + svfloat32_t v281 = svmul_f32_x(svptrue_b32(), v185, v624); + svfloat32_t v302 = svmla_f32_x(pred_full, v35, v179, v620); + svfloat32_t v303 = svmls_f32_x(pred_full, v35, v179, v620); + svfloat32_t v304 = svadd_f32_x(svptrue_b32(), v243, v250); + svfloat32_t v305 = svsub_f32_x(svptrue_b32(), v243, v250); + svfloat32_t v292 = svadd_f32_x(svptrue_b32(), v171, v207); + svfloat32_t v293 = svsub_f32_x(svptrue_b32(), v171, v207); + svfloat32_t v294 = svmla_f32_x(pred_full, v163, v177, v620); + svfloat32_t v295 = svadd_f32_x(svptrue_b32(), v219, v226); + svfloat32_t v296 = svmls_f32_x(pred_full, v163, v177, v620); + svfloat32_t v297 = svsub_f32_x(svptrue_b32(), v226, v219); + svfloat32_t v306 = svcmla_f32_x(pred_full, v262, v622, v180, 90); + svfloat32_t v307 = 
svsub_f32_x(svptrue_b32(), v262, v276); + svfloat32_t v308 = svnmls_f32_x(pred_full, v281, v181, v625); + svfloat32_t v309 = svnmls_f32_x(pred_full, v281, v183, v626); + svfloat32_t v310 = svnmls_f32_x(pred_full, v286, v185, v624); + svfloat32_t v311 = svnmls_f32_x(pred_full, v291, v185, v624); + svfloat32_t v316 = svadd_f32_x(svptrue_b32(), v303, v305); + svfloat32_t v317 = svsub_f32_x(svptrue_b32(), v303, v305); + svint16_t v338 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v174, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v402 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v175, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v298 = svadd_f32_x(svptrue_b32(), v294, v295); + svfloat32_t v299 = svadd_f32_x(svptrue_b32(), v296, v297); + svfloat32_t v300 = svsub_f32_x(svptrue_b32(), v296, v297); + svfloat32_t v301 = svsub_f32_x(svptrue_b32(), v294, v295); + svfloat32_t v312 = svadd_f32_x(svptrue_b32(), v302, v308); + svfloat32_t v313 = svsub_f32_x(svptrue_b32(), v302, v308); + svfloat32_t v314 = svadd_f32_x(svptrue_b32(), v302, v310); + svfloat32_t v315 = svsub_f32_x(svptrue_b32(), v302, v310); + svfloat32_t v318 = svadd_f32_x(svptrue_b32(), v303, v311); + svfloat32_t v319 = svsub_f32_x(svptrue_b32(), v303, v311); + svfloat32_t v322 = svadd_f32_x(svptrue_b32(), v306, v304); + svfloat32_t v323 = svsub_f32_x(svptrue_b32(), v306, v304); + svfloat32_t v324 = svadd_f32_x(svptrue_b32(), v307, v309); + svfloat32_t v325 = svsub_f32_x(svptrue_b32(), v307, v309); + svfloat32_t v326 = svadd_f32_x(svptrue_b32(), v307, v305); + svfloat32_t v327 = svsub_f32_x(svptrue_b32(), v307, v305); + svint16_t v370 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v293, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v434 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v292, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v634), svreinterpret_u64_s16(v338)); + svst1w_u64(pred_full, (unsigned *)(v706), svreinterpret_u64_s16(v402)); + svfloat32_t v328 = svadd_f32_x(svptrue_b32(), v312, v322); + svfloat32_t v329 = svadd_f32_x(svptrue_b32(), v313, v323); + svfloat32_t v330 = svsub_f32_x(svptrue_b32(), v314, v323); + svfloat32_t v331 = svsub_f32_x(svptrue_b32(), v315, v322); + svfloat32_t v332 = svadd_f32_x(svptrue_b32(), v316, v324); + svfloat32_t v333 = svadd_f32_x(svptrue_b32(), v317, v325); + svfloat32_t v334 = svsub_f32_x(svptrue_b32(), v318, v327); + svfloat32_t v335 = svsub_f32_x(svptrue_b32(), v319, v326); + svint16_t v354 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v301, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v386 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v300, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v418 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v299, (float)(1ULL << 31ULL)))), + 
svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v450 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v298, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v670), svreinterpret_u64_s16(v370)); + svst1w_u64(pred_full, (unsigned *)(v742), svreinterpret_u64_s16(v434)); + svint16_t v346 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v331, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v362 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v334, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v378 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v335, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v394 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v330, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v410 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v329, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v426 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v332, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v442 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v333, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v458 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v328, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v652), svreinterpret_u64_s16(v354)); + svst1w_u64(pred_full, (unsigned *)(v688), svreinterpret_u64_s16(v386)); + svst1w_u64(pred_full, (unsigned *)(v724), svreinterpret_u64_s16(v418)); + svst1w_u64(pred_full, (unsigned *)(v760), svreinterpret_u64_s16(v450)); + svst1w_u64(pred_full, (unsigned *)(v643), svreinterpret_u64_s16(v346)); + svst1w_u64(pred_full, (unsigned *)(v661), svreinterpret_u64_s16(v362)); + svst1w_u64(pred_full, (unsigned *)(v679), svreinterpret_u64_s16(v378)); + svst1w_u64(pred_full, (unsigned *)(v697), svreinterpret_u64_s16(v394)); + svst1w_u64(pred_full, (unsigned *)(v715), svreinterpret_u64_s16(v410)); + svst1w_u64(pred_full, (unsigned *)(v733), svreinterpret_u64_s16(v426)); + svst1w_u64(pred_full, (unsigned *)(v751), svreinterpret_u64_s16(v442)); + svst1w_u64(pred_full, (unsigned *)(v769), svreinterpret_u64_s16(v458)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uu17(const armral_cmplx_int16_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + for (int j = 0; j < howmany; 
j += 1) { + int16x4_t v20 = vld1s_s16(&v5[istride]); + float v190 = -4.2602849117736000e-02F; + float v194 = 2.0497965023262180e-01F; + float v198 = 1.0451835201736759e+00F; + float v202 = 1.7645848660222969e+00F; + float v206 = -7.2340797728605655e-01F; + float v210 = -8.9055591620606403e-02F; + float v214 = -1.0625000000000000e+00F; + float v218 = 2.5769410160110379e-01F; + float v222 = 7.7980260789483757e-01F; + float v226 = 5.4389318464570580e-01F; + float v230 = 4.2010193497052700e-01F; + float v234 = 1.2810929434228073e+00F; + float v238 = 4.4088907348175338e-01F; + float v242 = 3.1717619283272508e-01F; + float v245 = -9.0138318648016680e-01F; + float v246 = 9.0138318648016680e-01F; + float v252 = -4.3248756360072310e-01F; + float v253 = 4.3248756360072310e-01F; + float v259 = 6.6693537504044498e-01F; + float v260 = -6.6693537504044498e-01F; + float v266 = -6.0389004312516970e-01F; + float v267 = 6.0389004312516970e-01F; + float v273 = -3.6924873198582547e-01F; + float v274 = 3.6924873198582547e-01F; + float v280 = 4.8656938755549761e-01F; + float v281 = -4.8656938755549761e-01F; + float v287 = 2.3813712136760609e-01F; + float v288 = -2.3813712136760609e-01F; + float v294 = -1.5573820617422458e+00F; + float v295 = 1.5573820617422458e+00F; + float v301 = 6.5962247018731990e-01F; + float v302 = -6.5962247018731990e-01F; + float v308 = -1.4316961569866241e-01F; + float v309 = 1.4316961569866241e-01F; + float v315 = 2.3903469959860771e-01F; + float v316 = -2.3903469959860771e-01F; + float v322 = -4.7932541949972603e-02F; + float v323 = 4.7932541949972603e-02F; + float v329 = -2.3188014856550065e+00F; + float v330 = 2.3188014856550065e+00F; + float v336 = 7.8914568419206255e-01F; + float v337 = -7.8914568419206255e-01F; + float v343 = 3.8484572871179505e+00F; + float v344 = -3.8484572871179505e+00F; + float v350 = -1.3003804568801376e+00F; + float v351 = 1.3003804568801376e+00F; + float v357 = 4.0814769046889037e+00F; + float v358 = -4.0814769046889037e+00F; + float v364 = -1.4807159909286283e+00F; + float v365 = 1.4807159909286283e+00F; + float v371 = -1.3332470363551400e-02F; + float v372 = 1.3332470363551400e-02F; + float v378 = -3.7139778690557629e-01F; + float v379 = 3.7139778690557629e-01F; + float v385 = 1.9236512863456379e-01F; + float v386 = -1.9236512863456379e-01F; + float32x2_t v388 = (float32x2_t){v4, v4}; + float32x2_t v21 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v20)), 15); + int16x4_t v182 = vld1s_s16(&v5[0]); + float32x2_t v191 = (float32x2_t){v190, v190}; + float32x2_t v195 = (float32x2_t){v194, v194}; + float32x2_t v199 = (float32x2_t){v198, v198}; + float32x2_t v203 = (float32x2_t){v202, v202}; + float32x2_t v207 = (float32x2_t){v206, v206}; + float32x2_t v211 = (float32x2_t){v210, v210}; + float32x2_t v215 = (float32x2_t){v214, v214}; + float32x2_t v219 = (float32x2_t){v218, v218}; + float32x2_t v223 = (float32x2_t){v222, v222}; + float32x2_t v227 = (float32x2_t){v226, v226}; + float32x2_t v231 = (float32x2_t){v230, v230}; + float32x2_t v235 = (float32x2_t){v234, v234}; + float32x2_t v239 = (float32x2_t){v238, v238}; + float32x2_t v243 = (float32x2_t){v242, v242}; + float32x2_t v247 = (float32x2_t){v245, v246}; + float32x2_t v254 = (float32x2_t){v252, v253}; + float32x2_t v261 = (float32x2_t){v259, v260}; + float32x2_t v268 = (float32x2_t){v266, v267}; + float32x2_t v275 = (float32x2_t){v273, v274}; + float32x2_t v282 = (float32x2_t){v280, v281}; + float32x2_t v289 = (float32x2_t){v287, v288}; + float32x2_t v296 = (float32x2_t){v294, v295}; + float32x2_t v303 = 
(float32x2_t){v301, v302}; + float32x2_t v310 = (float32x2_t){v308, v309}; + float32x2_t v317 = (float32x2_t){v315, v316}; + float32x2_t v324 = (float32x2_t){v322, v323}; + float32x2_t v331 = (float32x2_t){v329, v330}; + float32x2_t v338 = (float32x2_t){v336, v337}; + float32x2_t v345 = (float32x2_t){v343, v344}; + float32x2_t v352 = (float32x2_t){v350, v351}; + float32x2_t v359 = (float32x2_t){v357, v358}; + float32x2_t v366 = (float32x2_t){v364, v365}; + float32x2_t v373 = (float32x2_t){v371, v372}; + float32x2_t v380 = (float32x2_t){v378, v379}; + float32x2_t v387 = (float32x2_t){v385, v386}; + int16x4_t v26 = vld1s_s16(&v5[istride * 16]); + int16x4_t v34 = vld1s_s16(&v5[istride * 3]); + int16x4_t v40 = vld1s_s16(&v5[istride * 14]); + int16x4_t v48 = vld1s_s16(&v5[istride * 9]); + int16x4_t v54 = vld1s_s16(&v5[istride * 8]); + int16x4_t v62 = vld1s_s16(&v5[istride * 10]); + int16x4_t v68 = vld1s_s16(&v5[istride * 7]); + int16x4_t v76 = vld1s_s16(&v5[istride * 13]); + int16x4_t v82 = vld1s_s16(&v5[istride * 4]); + int16x4_t v90 = vld1s_s16(&v5[istride * 5]); + int16x4_t v96 = vld1s_s16(&v5[istride * 12]); + int16x4_t v104 = vld1s_s16(&v5[istride * 15]); + int16x4_t v110 = vld1s_s16(&v5[istride * 2]); + int16x4_t v118 = vld1s_s16(&v5[istride * 11]); + int16x4_t v124 = vld1s_s16(&v5[istride * 6]); + float32x2_t v183 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v182)), 15); + float32x2_t v249 = vmul_f32(v388, v247); + float32x2_t v256 = vmul_f32(v388, v254); + float32x2_t v263 = vmul_f32(v388, v261); + float32x2_t v270 = vmul_f32(v388, v268); + float32x2_t v277 = vmul_f32(v388, v275); + float32x2_t v284 = vmul_f32(v388, v282); + float32x2_t v291 = vmul_f32(v388, v289); + float32x2_t v298 = vmul_f32(v388, v296); + float32x2_t v305 = vmul_f32(v388, v303); + float32x2_t v312 = vmul_f32(v388, v310); + float32x2_t v319 = vmul_f32(v388, v317); + float32x2_t v326 = vmul_f32(v388, v324); + float32x2_t v333 = vmul_f32(v388, v331); + float32x2_t v340 = vmul_f32(v388, v338); + float32x2_t v347 = vmul_f32(v388, v345); + float32x2_t v354 = vmul_f32(v388, v352); + float32x2_t v361 = vmul_f32(v388, v359); + float32x2_t v368 = vmul_f32(v388, v366); + float32x2_t v375 = vmul_f32(v388, v373); + float32x2_t v382 = vmul_f32(v388, v380); + float32x2_t v389 = vmul_f32(v388, v387); + float32x2_t v27 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v26)), 15); + float32x2_t v35 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v34)), 15); + float32x2_t v41 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v40)), 15); + float32x2_t v49 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v48)), 15); + float32x2_t v55 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v54)), 15); + float32x2_t v63 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v62)), 15); + float32x2_t v69 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v68)), 15); + float32x2_t v77 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v76)), 15); + float32x2_t v83 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v82)), 15); + float32x2_t v91 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v90)), 15); + float32x2_t v97 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v96)), 15); + float32x2_t v105 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v104)), 15); + float32x2_t v111 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v110)), 15); + float32x2_t v119 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v118)), 15); + float32x2_t v125 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v124)), 15); + float32x2_t v28 = vadd_f32(v21, v27); + float32x2_t v29 = vsub_f32(v21, v27); + float32x2_t v42 = vadd_f32(v35, v41); + float32x2_t v43 = vsub_f32(v35, v41); + float32x2_t v56 = vadd_f32(v49, v55); + 
float32x2_t v57 = vsub_f32(v49, v55); + float32x2_t v70 = vadd_f32(v63, v69); + float32x2_t v71 = vsub_f32(v63, v69); + float32x2_t v84 = vadd_f32(v77, v83); + float32x2_t v85 = vsub_f32(v77, v83); + float32x2_t v98 = vadd_f32(v91, v97); + float32x2_t v99 = vsub_f32(v91, v97); + float32x2_t v112 = vadd_f32(v105, v111); + float32x2_t v113 = vsub_f32(v105, v111); + float32x2_t v126 = vadd_f32(v119, v125); + float32x2_t v127 = vsub_f32(v119, v125); + float32x2_t v128 = vadd_f32(v28, v84); + float32x2_t v129 = vadd_f32(v42, v98); + float32x2_t v130 = vadd_f32(v56, v112); + float32x2_t v131 = vadd_f32(v70, v126); + float32x2_t v134 = vsub_f32(v28, v84); + float32x2_t v135 = vsub_f32(v42, v98); + float32x2_t v136 = vsub_f32(v56, v112); + float32x2_t v137 = vsub_f32(v70, v126); + float32x2_t v148 = vadd_f32(v29, v57); + float32x2_t v149 = vadd_f32(v43, v71); + float32x2_t v150 = vsub_f32(v29, v57); + float32x2_t v151 = vsub_f32(v127, v99); + float32x2_t v152 = vadd_f32(v85, v113); + float32x2_t v153 = vadd_f32(v99, v127); + float32x2_t v154 = vsub_f32(v85, v113); + float32x2_t v155 = vsub_f32(v43, v71); + float32x2_t v168 = vadd_f32(v29, v85); + float32x2_t v169 = vadd_f32(v71, v127); + float32x2_t v341 = vrev64_f32(v29); + float32x2_t v348 = vrev64_f32(v85); + float32x2_t v362 = vrev64_f32(v71); + float32x2_t v369 = vrev64_f32(v127); + float32x2_t v132 = vadd_f32(v128, v130); + float32x2_t v133 = vadd_f32(v129, v131); + float32x2_t v138 = vsub_f32(v128, v130); + float32x2_t v139 = vsub_f32(v129, v131); + float32x2_t v142 = vadd_f32(v135, v137); + float32x2_t v143 = vadd_f32(v134, v136); + float32x2_t v145 = vsub_f32(v136, v137); + float32x2_t v146 = vsub_f32(v134, v135); + float32x2_t v156 = vadd_f32(v148, v149); + float32x2_t v157 = vadd_f32(v152, v153); + float32x2_t v159 = vsub_f32(v148, v149); + float32x2_t v160 = vsub_f32(v152, v153); + float32x2_t v162 = vadd_f32(v150, v151); + float32x2_t v163 = vadd_f32(v154, v155); + float32x2_t v165 = vsub_f32(v150, v151); + float32x2_t v166 = vsub_f32(v154, v155); + float32x2_t v192 = vmul_f32(v134, v191); + float32x2_t v196 = vmul_f32(v135, v195); + float32x2_t v200 = vmul_f32(v136, v199); + float32x2_t v204 = vmul_f32(v137, v203); + float32x2_t v334 = vrev64_f32(v168); + float32x2_t v342 = vmul_f32(v341, v340); + float32x2_t v349 = vmul_f32(v348, v347); + float32x2_t v355 = vrev64_f32(v169); + float32x2_t v363 = vmul_f32(v362, v361); + float32x2_t v370 = vmul_f32(v369, v368); + float32x2_t v140 = vadd_f32(v132, v133); + float32x2_t v141 = vsub_f32(v132, v133); + float32x2_t v144 = vsub_f32(v143, v142); + float32x2_t v147 = vadd_f32(v138, v139); + float32x2_t v158 = vadd_f32(v156, v157); + float32x2_t v161 = vadd_f32(v159, v160); + float32x2_t v164 = vadd_f32(v162, v163); + float32x2_t v167 = vadd_f32(v165, v166); + float32x2_t v170 = vsub_f32(v163, v157); + float32x2_t v173 = vsub_f32(v156, v162); + float32x2_t v208 = vmul_f32(v138, v207); + float32x2_t v212 = vmul_f32(v139, v211); + float32x2_t v224 = vmul_f32(v142, v223); + float32x2_t v228 = vmul_f32(v143, v227); + float32x2_t v236 = vmul_f32(v145, v235); + float32x2_t v240 = vmul_f32(v146, v239); + float32x2_t v250 = vrev64_f32(v156); + float32x2_t v257 = vrev64_f32(v157); + float32x2_t v271 = vrev64_f32(v159); + float32x2_t v278 = vrev64_f32(v160); + float32x2_t v292 = vrev64_f32(v162); + float32x2_t v299 = vrev64_f32(v163); + float32x2_t v313 = vrev64_f32(v165); + float32x2_t v320 = vrev64_f32(v166); + float32x2_t v335 = vmul_f32(v334, v333); + float32x2_t v356 = vmul_f32(v355, v354); + 
float32x2_t v171 = vadd_f32(v170, v29); + float32x2_t v174 = vadd_f32(v173, v71); + float32x2_t v184 = vadd_f32(v183, v140); + float32x2_t v216 = vmul_f32(v140, v215); + float32x2_t v220 = vmul_f32(v141, v219); + float32x2_t v232 = vmul_f32(v144, v231); + float32x2_t v244 = vmul_f32(v147, v243); + float32x2_t v251 = vmul_f32(v250, v249); + float32x2_t v258 = vmul_f32(v257, v256); + float32x2_t v264 = vrev64_f32(v158); + float32x2_t v272 = vmul_f32(v271, v270); + float32x2_t v279 = vmul_f32(v278, v277); + float32x2_t v285 = vrev64_f32(v161); + float32x2_t v293 = vmul_f32(v292, v291); + float32x2_t v300 = vmul_f32(v299, v298); + float32x2_t v306 = vrev64_f32(v164); + float32x2_t v314 = vmul_f32(v313, v312); + float32x2_t v321 = vmul_f32(v320, v319); + float32x2_t v327 = vrev64_f32(v167); + float32x2_t v394 = vadd_f32(v204, v236); + float32x2_t v395 = vsub_f32(v236, v200); + float32x2_t v396 = vadd_f32(v196, v240); + float32x2_t v397 = vsub_f32(v192, v240); + float32x2_t v172 = vsub_f32(v171, v169); + float32x2_t v175 = vadd_f32(v174, v85); + float32x2_t v265 = vmul_f32(v264, v263); + float32x2_t v286 = vmul_f32(v285, v284); + float32x2_t v307 = vmul_f32(v306, v305); + float32x2_t v328 = vmul_f32(v327, v326); + float32x2_t v392 = vadd_f32(v224, v232); + float32x2_t v393 = vsub_f32(v228, v232); + float32x2_t v398 = vsub_f32(v244, v212); + float32x2_t v399 = vadd_f32(v244, v208); + float32x2_t v400 = vadd_f32(v216, v184); + int16x4_t v468 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v184, 15), (int32x2_t){0, 0})); + float32x2_t v176 = vsub_f32(v175, v127); + float32x2_t v376 = vrev64_f32(v172); + float32x2_t v401 = vadd_f32(v220, v400); + float32x2_t v402 = vsub_f32(v400, v220); + float32x2_t v403 = vsub_f32(v392, v394); + float32x2_t v405 = vadd_f32(v393, v395); + float32x2_t v407 = vadd_f32(v392, v396); + float32x2_t v409 = vadd_f32(v393, v397); + float32x2_t v419 = vadd_f32(v251, v265); + float32x2_t v420 = vadd_f32(v258, v265); + float32x2_t v421 = vadd_f32(v272, v286); + float32x2_t v422 = vadd_f32(v279, v286); + float32x2_t v423 = vadd_f32(v293, v307); + float32x2_t v424 = vadd_f32(v300, v307); + float32x2_t v425 = vadd_f32(v314, v328); + float32x2_t v426 = vadd_f32(v321, v328); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v468), 0); + float32x2_t v177 = vadd_f32(v172, v176); + float32x2_t v377 = vmul_f32(v376, v375); + float32x2_t v383 = vrev64_f32(v176); + float32x2_t v404 = vadd_f32(v398, v401); + float32x2_t v406 = vadd_f32(v399, v402); + float32x2_t v408 = vsub_f32(v401, v398); + float32x2_t v410 = vsub_f32(v402, v399); + float32x2_t v430 = vadd_f32(v419, v421); + float32x2_t v431 = vsub_f32(v419, v421); + float32x2_t v432 = vadd_f32(v420, v422); + float32x2_t v433 = vsub_f32(v420, v422); + float32x2_t v434 = vadd_f32(v423, v425); + float32x2_t v435 = vsub_f32(v425, v423); + float32x2_t v436 = vadd_f32(v424, v426); + float32x2_t v437 = vsub_f32(v426, v424); + float32x2_t v384 = vmul_f32(v383, v382); + float32x2_t v390 = vrev64_f32(v177); + float32x2_t v411 = vadd_f32(v403, v404); + float32x2_t v412 = vadd_f32(v405, v406); + float32x2_t v413 = vadd_f32(v407, v408); + float32x2_t v414 = vadd_f32(v409, v410); + float32x2_t v415 = vsub_f32(v404, v403); + float32x2_t v416 = vsub_f32(v406, v405); + float32x2_t v417 = vsub_f32(v408, v407); + float32x2_t v418 = vsub_f32(v410, v409); + float32x2_t v447 = vadd_f32(v432, v436); + float32x2_t v449 = vadd_f32(v431, v437); + float32x2_t v451 = vsub_f32(v430, v434); + float32x2_t v453 = vsub_f32(v437, v431); + float32x2_t v455 = vadd_f32(v430, v434); 
+ float32x2_t v458 = vsub_f32(v435, v433); + float32x2_t v461 = vsub_f32(v436, v432); + float32x2_t v464 = vadd_f32(v433, v435); + float32x2_t v391 = vmul_f32(v390, v389); + float32x2_t v438 = vsub_f32(v377, v384); + float32x2_t v427 = vadd_f32(v391, v384); + float32x2_t v440 = vadd_f32(v438, v438); + float32x2_t v465 = vsub_f32(v464, v438); + float32x2_t v428 = vadd_f32(v335, v427); + float32x2_t v441 = vsub_f32(v356, v440); + float32x2_t v444 = vadd_f32(v427, v427); + float32x2_t v462 = vadd_f32(v461, v440); + float32x2_t v500 = vadd_f32(v418, v465); + float32x2_t v507 = vsub_f32(v418, v465); + float32x2_t v429 = vadd_f32(v428, v342); + float32x2_t v439 = vadd_f32(v428, v349); + float32x2_t v442 = vadd_f32(v441, v363); + float32x2_t v443 = vadd_f32(v441, v370); + float32x2_t v445 = vadd_f32(v444, v444); + float32x2_t v446 = vadd_f32(v438, v444); + float32x2_t v452 = vadd_f32(v451, v444); + float32x2_t v463 = vadd_f32(v462, v444); + int16x4_t v503 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v500, 15), (int32x2_t){0, 0})); + int16x4_t v510 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v507, 15), (int32x2_t){0, 0})); + float32x2_t v448 = vadd_f32(v447, v439); + float32x2_t v450 = vadd_f32(v449, v442); + float32x2_t v454 = vsub_f32(v453, v446); + float32x2_t v456 = vadd_f32(v455, v429); + float32x2_t v459 = vsub_f32(v458, v443); + float32x2_t v486 = vadd_f32(v413, v452); + float32x2_t v493 = vsub_f32(v413, v452); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v503), 0); + v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v510), 0); + float32x2_t v570 = vadd_f32(v417, v463); + float32x2_t v577 = vsub_f32(v417, v463); + float32x2_t v457 = vadd_f32(v456, v438); + float32x2_t v460 = vadd_f32(v459, v445); + float32x2_t v472 = vadd_f32(v411, v448); + float32x2_t v479 = vsub_f32(v411, v448); + int16x4_t v489 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v486, 15), (int32x2_t){0, 0})); + int16x4_t v496 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v493, 15), (int32x2_t){0, 0})); + float32x2_t v528 = vadd_f32(v414, v454); + float32x2_t v535 = vsub_f32(v414, v454); + float32x2_t v542 = vadd_f32(v412, v450); + float32x2_t v549 = vsub_f32(v412, v450); + int16x4_t v573 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v570, 15), (int32x2_t){0, 0})); + int16x4_t v580 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v577, 15), (int32x2_t){0, 0})); + int16x4_t v475 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v472, 15), (int32x2_t){0, 0})); + int16x4_t v482 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v479, 15), (int32x2_t){0, 0})); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v489), 0); + v6[ostride * 15] = vget_lane_s32(vreinterpret_s32_s16(v496), 0); + float32x2_t v514 = vadd_f32(v415, v457); + float32x2_t v521 = vsub_f32(v415, v457); + int16x4_t v531 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v528, 15), (int32x2_t){0, 0})); + int16x4_t v538 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v535, 15), (int32x2_t){0, 0})); + int16x4_t v545 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v542, 15), (int32x2_t){0, 0})); + int16x4_t v552 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v549, 15), (int32x2_t){0, 0})); + float32x2_t v556 = vadd_f32(v416, v460); + float32x2_t v563 = vsub_f32(v416, v460); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v573), 0); + v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v580), 0); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v475), 0); + v6[ostride * 16] = vget_lane_s32(vreinterpret_s32_s16(v482), 0); + int16x4_t v517 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v514, 15), 
(int32x2_t){0, 0})); + int16x4_t v524 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v521, 15), (int32x2_t){0, 0})); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v531), 0); + v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v538), 0); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v545), 0); + v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v552), 0); + int16x4_t v559 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v556, 15), (int32x2_t){0, 0})); + int16x4_t v566 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v563, 15), (int32x2_t){0, 0})); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v517), 0); + v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v524), 0); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v559), 0); + v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v566), 0); + v5 += 1 * 1; + v6 += 1 * 1; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uu17(const armral_cmplx_int16_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * 1; + int64_t v12 = v10 * 1; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + float v227 = -4.2602849117736000e-02F; + float v232 = 2.0497965023262180e-01F; + float v237 = 1.0451835201736759e+00F; + float v242 = 1.7645848660222969e+00F; + float v247 = -7.2340797728605655e-01F; + float v252 = -8.9055591620606403e-02F; + float v257 = -1.0625000000000000e+00F; + float v262 = 2.5769410160110379e-01F; + float v267 = 7.7980260789483757e-01F; + float v272 = 5.4389318464570580e-01F; + float v277 = 4.2010193497052700e-01F; + float v282 = 1.2810929434228073e+00F; + float v287 = 4.4088907348175338e-01F; + float v292 = 3.1717619283272508e-01F; + float v297 = 9.0138318648016680e-01F; + float v304 = 4.3248756360072310e-01F; + float v311 = -6.6693537504044498e-01F; + float v318 = 6.0389004312516970e-01F; + float v325 = 3.6924873198582547e-01F; + float v332 = -4.8656938755549761e-01F; + float v339 = -2.3813712136760609e-01F; + float v346 = 1.5573820617422458e+00F; + float v353 = -6.5962247018731990e-01F; + float v360 = 1.4316961569866241e-01F; + float v367 = -2.3903469959860771e-01F; + float v374 = 4.7932541949972603e-02F; + float v381 = 2.3188014856550065e+00F; + float v388 = -7.8914568419206255e-01F; + float v395 = -3.8484572871179505e+00F; + float v402 = 1.3003804568801376e+00F; + float v409 = -4.0814769046889037e+00F; + float v416 = 1.4807159909286283e+00F; + float v423 = 1.3332470363551400e-02F; + float v430 = 3.7139778690557629e-01F; + float v437 = -1.9236512863456379e-01F; + const int32_t *v675 = &v5[v0]; + int32_t *v875 = &v6[v2]; + int64_t v27 = v0 * 16; + int64_t v37 = v0 * 3; + int64_t v45 = v0 * 14; + int64_t v55 = v0 * 9; + int64_t v63 = v0 * 8; + int64_t v73 = v0 * 10; + int64_t v81 = v0 * 7; + int64_t v91 = v0 * 13; + int64_t v99 = v0 * 4; + int64_t v109 = v0 * 5; + int64_t v117 = v0 * 12; + int64_t v127 = v0 * 15; + int64_t v135 = v0 * 2; + int64_t v145 = v0 * 11; + int64_t v153 = v0 * 6; + float v300 = v4 * v297; + float v307 = v4 * v304; + float v314 = v4 * v311; + float v321 = v4 * v318; + float v328 = v4 * v325; + float v335 = v4 * v332; + float v342 = v4 * v339; + float v349 = v4 * v346; + float v356 = v4 * v353; + float v363 = v4 * v360; + float v370 = v4 * v367; + float v377 = 
v4 * v374; + float v384 = v4 * v381; + float v391 = v4 * v388; + float v398 = v4 * v395; + float v405 = v4 * v402; + float v412 = v4 * v409; + float v419 = v4 * v416; + float v426 = v4 * v423; + float v433 = v4 * v430; + float v440 = v4 * v437; + int64_t v536 = v2 * 16; + int64_t v545 = v2 * 2; + int64_t v554 = v2 * 15; + int64_t v563 = v2 * 3; + int64_t v572 = v2 * 14; + int64_t v581 = v2 * 4; + int64_t v590 = v2 * 13; + int64_t v599 = v2 * 5; + int64_t v608 = v2 * 12; + int64_t v617 = v2 * 6; + int64_t v626 = v2 * 11; + int64_t v635 = v2 * 7; + int64_t v644 = v2 * 10; + int64_t v653 = v2 * 8; + int64_t v662 = v2 * 9; + const int32_t *v820 = &v5[0]; + svfloat32_t v824 = svdup_n_f32(v227); + svfloat32_t v825 = svdup_n_f32(v232); + svfloat32_t v826 = svdup_n_f32(v237); + svfloat32_t v827 = svdup_n_f32(v242); + svfloat32_t v828 = svdup_n_f32(v247); + svfloat32_t v829 = svdup_n_f32(v252); + svfloat32_t v830 = svdup_n_f32(v257); + svfloat32_t v831 = svdup_n_f32(v262); + svfloat32_t v832 = svdup_n_f32(v267); + svfloat32_t v833 = svdup_n_f32(v272); + svfloat32_t v834 = svdup_n_f32(v277); + svfloat32_t v835 = svdup_n_f32(v282); + svfloat32_t v836 = svdup_n_f32(v287); + svfloat32_t v837 = svdup_n_f32(v292); + int32_t *v866 = &v6[0]; + svfloat32_t v25 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v675[0])), + 1.F / (1ULL << 15ULL)); + const int32_t *v684 = &v5[v27]; + const int32_t *v693 = &v5[v37]; + const int32_t *v702 = &v5[v45]; + const int32_t *v711 = &v5[v55]; + const int32_t *v720 = &v5[v63]; + const int32_t *v729 = &v5[v73]; + const int32_t *v738 = &v5[v81]; + const int32_t *v747 = &v5[v91]; + const int32_t *v756 = &v5[v99]; + const int32_t *v765 = &v5[v109]; + const int32_t *v774 = &v5[v117]; + const int32_t *v783 = &v5[v127]; + const int32_t *v792 = &v5[v135]; + const int32_t *v801 = &v5[v145]; + const int32_t *v810 = &v5[v153]; + svfloat32_t v838 = svdup_n_f32(v300); + svfloat32_t v839 = svdup_n_f32(v307); + svfloat32_t v840 = svdup_n_f32(v314); + svfloat32_t v841 = svdup_n_f32(v321); + svfloat32_t v842 = svdup_n_f32(v328); + svfloat32_t v843 = svdup_n_f32(v335); + svfloat32_t v844 = svdup_n_f32(v342); + svfloat32_t v845 = svdup_n_f32(v349); + svfloat32_t v846 = svdup_n_f32(v356); + svfloat32_t v847 = svdup_n_f32(v363); + svfloat32_t v848 = svdup_n_f32(v370); + svfloat32_t v849 = svdup_n_f32(v377); + svfloat32_t v850 = svdup_n_f32(v384); + svfloat32_t v851 = svdup_n_f32(v391); + svfloat32_t v852 = svdup_n_f32(v398); + svfloat32_t v853 = svdup_n_f32(v405); + svfloat32_t v854 = svdup_n_f32(v412); + svfloat32_t v855 = svdup_n_f32(v419); + svfloat32_t v856 = svdup_n_f32(v426); + svfloat32_t v857 = svdup_n_f32(v433); + svfloat32_t v858 = svdup_n_f32(v440); + int32_t *v884 = &v6[v536]; + int32_t *v893 = &v6[v545]; + int32_t *v902 = &v6[v554]; + int32_t *v911 = &v6[v563]; + int32_t *v920 = &v6[v572]; + int32_t *v929 = &v6[v581]; + int32_t *v938 = &v6[v590]; + int32_t *v947 = &v6[v599]; + int32_t *v956 = &v6[v608]; + int32_t *v965 = &v6[v617]; + int32_t *v974 = &v6[v626]; + int32_t *v983 = &v6[v635]; + int32_t *v992 = &v6[v644]; + int32_t *v1001 = &v6[v653]; + int32_t *v1010 = &v6[v662]; + svfloat32_t v219 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v820[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v33 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v684[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v43 = svmul_n_f32_x( + 
pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v693[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v51 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v702[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v61 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v711[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v69 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v720[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v79 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v729[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v87 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v738[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v97 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v747[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v105 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v756[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v115 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v765[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v123 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v774[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v133 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v783[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v141 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v792[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v151 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v801[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v159 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v810[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v34 = svadd_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v35 = svsub_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v52 = svadd_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v53 = svsub_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v70 = svadd_f32_x(svptrue_b32(), v61, v69); + svfloat32_t v71 = svsub_f32_x(svptrue_b32(), v61, v69); + svfloat32_t v88 = svadd_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v89 = svsub_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v106 = svadd_f32_x(svptrue_b32(), v97, v105); + svfloat32_t v107 = svsub_f32_x(svptrue_b32(), v97, v105); + svfloat32_t v124 = svadd_f32_x(svptrue_b32(), v115, v123); + svfloat32_t v125 = svsub_f32_x(svptrue_b32(), v115, v123); + svfloat32_t v142 = svadd_f32_x(svptrue_b32(), v133, v141); + svfloat32_t v143 = svsub_f32_x(svptrue_b32(), v133, v141); + svfloat32_t v160 = svadd_f32_x(svptrue_b32(), v151, v159); + svfloat32_t v161 = svsub_f32_x(svptrue_b32(), v151, v159); + svfloat32_t v162 = svadd_f32_x(svptrue_b32(), v34, v106); + svfloat32_t v163 = svadd_f32_x(svptrue_b32(), v52, v124); + svfloat32_t v164 = svadd_f32_x(svptrue_b32(), v70, v142); + svfloat32_t v165 = svadd_f32_x(svptrue_b32(), v88, v160); + svfloat32_t v168 = svsub_f32_x(svptrue_b32(), v34, v106); + svfloat32_t v169 = svsub_f32_x(svptrue_b32(), v52, v124); + svfloat32_t v170 = svsub_f32_x(svptrue_b32(), v70, v142); + svfloat32_t 
v171 = svsub_f32_x(svptrue_b32(), v88, v160); + svfloat32_t v182 = svadd_f32_x(svptrue_b32(), v35, v71); + svfloat32_t v183 = svadd_f32_x(svptrue_b32(), v53, v89); + svfloat32_t v184 = svsub_f32_x(svptrue_b32(), v35, v71); + svfloat32_t v185 = svsub_f32_x(svptrue_b32(), v161, v125); + svfloat32_t v186 = svadd_f32_x(svptrue_b32(), v107, v143); + svfloat32_t v187 = svadd_f32_x(svptrue_b32(), v125, v161); + svfloat32_t v188 = svsub_f32_x(svptrue_b32(), v107, v143); + svfloat32_t v189 = svsub_f32_x(svptrue_b32(), v53, v89); + svfloat32_t v202 = svadd_f32_x(svptrue_b32(), v35, v107); + svfloat32_t v203 = svadd_f32_x(svptrue_b32(), v89, v161); + svfloat32_t v166 = svadd_f32_x(svptrue_b32(), v162, v164); + svfloat32_t v167 = svadd_f32_x(svptrue_b32(), v163, v165); + svfloat32_t v172 = svsub_f32_x(svptrue_b32(), v162, v164); + svfloat32_t v173 = svsub_f32_x(svptrue_b32(), v163, v165); + svfloat32_t v176 = svadd_f32_x(svptrue_b32(), v169, v171); + svfloat32_t v177 = svadd_f32_x(svptrue_b32(), v168, v170); + svfloat32_t v179 = svsub_f32_x(svptrue_b32(), v170, v171); + svfloat32_t v180 = svsub_f32_x(svptrue_b32(), v168, v169); + svfloat32_t v190 = svadd_f32_x(svptrue_b32(), v182, v183); + svfloat32_t v191 = svadd_f32_x(svptrue_b32(), v186, v187); + svfloat32_t v193 = svsub_f32_x(svptrue_b32(), v182, v183); + svfloat32_t v194 = svsub_f32_x(svptrue_b32(), v186, v187); + svfloat32_t v196 = svadd_f32_x(svptrue_b32(), v184, v185); + svfloat32_t v197 = svadd_f32_x(svptrue_b32(), v188, v189); + svfloat32_t v199 = svsub_f32_x(svptrue_b32(), v184, v185); + svfloat32_t v200 = svsub_f32_x(svptrue_b32(), v188, v189); + svfloat32_t v240 = svmul_f32_x(svptrue_b32(), v170, v826); + svfloat32_t zero407 = svdup_n_f32(0); + svfloat32_t v407 = svcmla_f32_x(pred_full, zero407, v853, v203, 90); + svfloat32_t v174 = svadd_f32_x(svptrue_b32(), v166, v167); + svfloat32_t v175 = svsub_f32_x(svptrue_b32(), v166, v167); + svfloat32_t v178 = svsub_f32_x(svptrue_b32(), v177, v176); + svfloat32_t v181 = svadd_f32_x(svptrue_b32(), v172, v173); + svfloat32_t v192 = svadd_f32_x(svptrue_b32(), v190, v191); + svfloat32_t v195 = svadd_f32_x(svptrue_b32(), v193, v194); + svfloat32_t v198 = svadd_f32_x(svptrue_b32(), v196, v197); + svfloat32_t v201 = svadd_f32_x(svptrue_b32(), v199, v200); + svfloat32_t v204 = svsub_f32_x(svptrue_b32(), v197, v191); + svfloat32_t v207 = svsub_f32_x(svptrue_b32(), v190, v196); + svfloat32_t v250 = svmul_f32_x(svptrue_b32(), v172, v828); + svfloat32_t v255 = svmul_f32_x(svptrue_b32(), v173, v829); + svfloat32_t v285 = svmul_f32_x(svptrue_b32(), v179, v835); + svfloat32_t v290 = svmul_f32_x(svptrue_b32(), v180, v836); + svfloat32_t v205 = svadd_f32_x(svptrue_b32(), v204, v35); + svfloat32_t v208 = svadd_f32_x(svptrue_b32(), v207, v89); + svfloat32_t v220 = svadd_f32_x(svptrue_b32(), v219, v174); + svfloat32_t v280 = svmul_f32_x(svptrue_b32(), v178, v834); + svfloat32_t zero316 = svdup_n_f32(0); + svfloat32_t v316 = svcmla_f32_x(pred_full, zero316, v840, v192, 90); + svfloat32_t zero337 = svdup_n_f32(0); + svfloat32_t v337 = svcmla_f32_x(pred_full, zero337, v843, v195, 90); + svfloat32_t zero358 = svdup_n_f32(0); + svfloat32_t v358 = svcmla_f32_x(pred_full, zero358, v846, v198, 90); + svfloat32_t zero379 = svdup_n_f32(0); + svfloat32_t v379 = svcmla_f32_x(pred_full, zero379, v849, v201, 90); + svfloat32_t v445 = svmla_f32_x(pred_full, v285, v171, v827); + svfloat32_t v446 = svnmls_f32_x(pred_full, v240, v179, v835); + svfloat32_t v447 = svmla_f32_x(pred_full, v290, v169, v825); + svfloat32_t v448 = 
svnmls_f32_x(pred_full, v290, v168, v824); + svfloat32_t v206 = svsub_f32_x(svptrue_b32(), v205, v203); + svfloat32_t v209 = svadd_f32_x(svptrue_b32(), v208, v107); + svfloat32_t v443 = svmla_f32_x(pred_full, v280, v176, v832); + svfloat32_t v444 = svnmls_f32_x(pred_full, v280, v177, v833); + svfloat32_t v449 = svnmls_f32_x(pred_full, v255, v181, v837); + svfloat32_t v450 = svmla_f32_x(pred_full, v250, v181, v837); + svfloat32_t v451 = svmla_f32_x(pred_full, v220, v174, v830); + svfloat32_t v470 = svcmla_f32_x(pred_full, v316, v838, v190, 90); + svfloat32_t v471 = svcmla_f32_x(pred_full, v316, v839, v191, 90); + svfloat32_t v472 = svcmla_f32_x(pred_full, v337, v841, v193, 90); + svfloat32_t v473 = svcmla_f32_x(pred_full, v337, v842, v194, 90); + svfloat32_t v474 = svcmla_f32_x(pred_full, v358, v844, v196, 90); + svfloat32_t v475 = svcmla_f32_x(pred_full, v358, v845, v197, 90); + svfloat32_t v476 = svcmla_f32_x(pred_full, v379, v847, v199, 90); + svfloat32_t v477 = svcmla_f32_x(pred_full, v379, v848, v200, 90); + svint16_t v519 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v220, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v210 = svsub_f32_x(svptrue_b32(), v209, v161); + svfloat32_t zero428 = svdup_n_f32(0); + svfloat32_t v428 = svcmla_f32_x(pred_full, zero428, v856, v206, 90); + svfloat32_t v452 = svmla_f32_x(pred_full, v451, v175, v831); + svfloat32_t v453 = svmls_f32_x(pred_full, v451, v175, v831); + svfloat32_t v454 = svsub_f32_x(svptrue_b32(), v443, v445); + svfloat32_t v456 = svadd_f32_x(svptrue_b32(), v444, v446); + svfloat32_t v458 = svadd_f32_x(svptrue_b32(), v443, v447); + svfloat32_t v460 = svadd_f32_x(svptrue_b32(), v444, v448); + svfloat32_t v481 = svadd_f32_x(svptrue_b32(), v470, v472); + svfloat32_t v482 = svsub_f32_x(svptrue_b32(), v470, v472); + svfloat32_t v483 = svadd_f32_x(svptrue_b32(), v471, v473); + svfloat32_t v484 = svsub_f32_x(svptrue_b32(), v471, v473); + svfloat32_t v485 = svadd_f32_x(svptrue_b32(), v474, v476); + svfloat32_t v486 = svsub_f32_x(svptrue_b32(), v476, v474); + svfloat32_t v487 = svadd_f32_x(svptrue_b32(), v475, v477); + svfloat32_t v488 = svsub_f32_x(svptrue_b32(), v477, v475); + svst1w_u64(pred_full, (unsigned *)(v866), svreinterpret_u64_s16(v519)); + svfloat32_t v211 = svadd_f32_x(svptrue_b32(), v206, v210); + svfloat32_t zero435 = svdup_n_f32(0); + svfloat32_t v435 = svcmla_f32_x(pred_full, zero435, v857, v210, 90); + svfloat32_t v455 = svadd_f32_x(svptrue_b32(), v449, v452); + svfloat32_t v457 = svadd_f32_x(svptrue_b32(), v450, v453); + svfloat32_t v459 = svsub_f32_x(svptrue_b32(), v452, v449); + svfloat32_t v461 = svsub_f32_x(svptrue_b32(), v453, v450); + svfloat32_t v498 = svadd_f32_x(svptrue_b32(), v483, v487); + svfloat32_t v500 = svadd_f32_x(svptrue_b32(), v482, v488); + svfloat32_t v502 = svsub_f32_x(svptrue_b32(), v481, v485); + svfloat32_t v504 = svsub_f32_x(svptrue_b32(), v488, v482); + svfloat32_t v506 = svadd_f32_x(svptrue_b32(), v481, v485); + svfloat32_t v509 = svsub_f32_x(svptrue_b32(), v486, v484); + svfloat32_t v512 = svsub_f32_x(svptrue_b32(), v487, v483); + svfloat32_t v515 = svadd_f32_x(svptrue_b32(), v484, v486); + svfloat32_t v462 = svadd_f32_x(svptrue_b32(), v454, v455); + svfloat32_t v463 = svadd_f32_x(svptrue_b32(), v456, v457); + svfloat32_t v464 = svadd_f32_x(svptrue_b32(), v458, v459); + svfloat32_t v465 = svadd_f32_x(svptrue_b32(), v460, v461); + svfloat32_t v466 = 
svsub_f32_x(svptrue_b32(), v455, v454); + svfloat32_t v467 = svsub_f32_x(svptrue_b32(), v457, v456); + svfloat32_t v468 = svsub_f32_x(svptrue_b32(), v459, v458); + svfloat32_t v469 = svsub_f32_x(svptrue_b32(), v461, v460); + svfloat32_t v489 = svsub_f32_x(svptrue_b32(), v428, v435); + svfloat32_t v478 = svcmla_f32_x(pred_full, v435, v858, v211, 90); + svfloat32_t v491 = svadd_f32_x(svptrue_b32(), v489, v489); + svfloat32_t v516 = svsub_f32_x(svptrue_b32(), v515, v489); + svfloat32_t v479 = svcmla_f32_x(pred_full, v478, v850, v202, 90); + svfloat32_t v492 = svsub_f32_x(svptrue_b32(), v407, v491); + svfloat32_t v495 = svadd_f32_x(svptrue_b32(), v478, v478); + svfloat32_t v513 = svadd_f32_x(svptrue_b32(), v512, v491); + svfloat32_t v561 = svadd_f32_x(svptrue_b32(), v469, v516); + svfloat32_t v570 = svsub_f32_x(svptrue_b32(), v469, v516); + svfloat32_t v480 = svcmla_f32_x(pred_full, v479, v851, v35, 90); + svfloat32_t v490 = svcmla_f32_x(pred_full, v479, v852, v107, 90); + svfloat32_t v493 = svcmla_f32_x(pred_full, v492, v854, v89, 90); + svfloat32_t v494 = svcmla_f32_x(pred_full, v492, v855, v161, 90); + svfloat32_t v496 = svadd_f32_x(svptrue_b32(), v495, v495); + svfloat32_t v497 = svadd_f32_x(svptrue_b32(), v489, v495); + svfloat32_t v503 = svadd_f32_x(svptrue_b32(), v502, v495); + svfloat32_t v514 = svadd_f32_x(svptrue_b32(), v513, v495); + svint16_t v564 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v561, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v573 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v570, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v499 = svadd_f32_x(svptrue_b32(), v498, v490); + svfloat32_t v501 = svadd_f32_x(svptrue_b32(), v500, v493); + svfloat32_t v505 = svsub_f32_x(svptrue_b32(), v504, v497); + svfloat32_t v507 = svadd_f32_x(svptrue_b32(), v506, v480); + svfloat32_t v510 = svsub_f32_x(svptrue_b32(), v509, v494); + svfloat32_t v543 = svadd_f32_x(svptrue_b32(), v464, v503); + svfloat32_t v552 = svsub_f32_x(svptrue_b32(), v464, v503); + svfloat32_t v651 = svadd_f32_x(svptrue_b32(), v468, v514); + svfloat32_t v660 = svsub_f32_x(svptrue_b32(), v468, v514); + svst1w_u64(pred_full, (unsigned *)(v911), svreinterpret_u64_s16(v564)); + svst1w_u64(pred_full, (unsigned *)(v920), svreinterpret_u64_s16(v573)); + svfloat32_t v508 = svadd_f32_x(svptrue_b32(), v507, v489); + svfloat32_t v511 = svadd_f32_x(svptrue_b32(), v510, v496); + svfloat32_t v525 = svadd_f32_x(svptrue_b32(), v462, v499); + svfloat32_t v534 = svsub_f32_x(svptrue_b32(), v462, v499); + svint16_t v546 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v543, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v555 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v552, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v597 = svadd_f32_x(svptrue_b32(), v465, v505); + svfloat32_t v606 = svsub_f32_x(svptrue_b32(), v465, v505); + svfloat32_t v615 = svadd_f32_x(svptrue_b32(), v463, v501); + svfloat32_t v624 = svsub_f32_x(svptrue_b32(), v463, v501); + svint16_t v654 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, 
svmul_n_f32_x(pred_full, v651, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v663 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v660, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v528 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v525, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v537 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v534, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v579 = svadd_f32_x(svptrue_b32(), v466, v508); + svfloat32_t v588 = svsub_f32_x(svptrue_b32(), v466, v508); + svint16_t v600 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v597, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v609 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v606, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v618 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v615, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v627 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v624, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v633 = svadd_f32_x(svptrue_b32(), v467, v511); + svfloat32_t v642 = svsub_f32_x(svptrue_b32(), v467, v511); + svst1w_u64(pred_full, (unsigned *)(v893), svreinterpret_u64_s16(v546)); + svst1w_u64(pred_full, (unsigned *)(v902), svreinterpret_u64_s16(v555)); + svst1w_u64(pred_full, (unsigned *)(v1001), svreinterpret_u64_s16(v654)); + svst1w_u64(pred_full, (unsigned *)(v1010), svreinterpret_u64_s16(v663)); + svint16_t v582 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v579, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v591 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v588, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v636 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v633, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v645 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v642, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v875), svreinterpret_u64_s16(v528)); + svst1w_u64(pred_full, (unsigned *)(v884), svreinterpret_u64_s16(v537)); + svst1w_u64(pred_full, (unsigned *)(v947), svreinterpret_u64_s16(v600)); + svst1w_u64(pred_full, (unsigned *)(v956), svreinterpret_u64_s16(v609)); + svst1w_u64(pred_full, (unsigned 
*)(v965), svreinterpret_u64_s16(v618)); + svst1w_u64(pred_full, (unsigned *)(v974), svreinterpret_u64_s16(v627)); + svst1w_u64(pred_full, (unsigned *)(v929), svreinterpret_u64_s16(v582)); + svst1w_u64(pred_full, (unsigned *)(v938), svreinterpret_u64_s16(v591)); + svst1w_u64(pred_full, (unsigned *)(v983), svreinterpret_u64_s16(v636)); + svst1w_u64(pred_full, (unsigned *)(v992), svreinterpret_u64_s16(v645)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uu18(const armral_cmplx_int16_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + for (int j = 0; j < howmany; j += 1) { + int16x4_t v96 = vld1s_s16(&v5[istride]); + float v271 = -5.0000000000000000e-01F; + float v282 = -1.4999999999999998e+00F; + float v285 = 8.6602540378443871e-01F; + float v286 = -8.6602540378443871e-01F; + float v293 = 7.6604444311897801e-01F; + float v297 = 9.3969262078590832e-01F; + float v301 = -1.7364817766693039e-01F; + float v304 = 6.4278760968653925e-01F; + float v305 = -6.4278760968653925e-01F; + float v311 = -3.4202014332566888e-01F; + float v312 = 3.4202014332566888e-01F; + float v318 = 9.8480775301220802e-01F; + float v319 = -9.8480775301220802e-01F; + float32x2_t v321 = (float32x2_t){v4, v4}; + int16x4_t v20 = vld1s_s16(&v5[0]); + float32x2_t v97 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v96)), 15); + float32x2_t v272 = (float32x2_t){v271, v271}; + float32x2_t v283 = (float32x2_t){v282, v282}; + float32x2_t v287 = (float32x2_t){v285, v286}; + float32x2_t v294 = (float32x2_t){v293, v293}; + float32x2_t v298 = (float32x2_t){v297, v297}; + float32x2_t v302 = (float32x2_t){v301, v301}; + float32x2_t v306 = (float32x2_t){v304, v305}; + float32x2_t v313 = (float32x2_t){v311, v312}; + float32x2_t v320 = (float32x2_t){v318, v319}; + float32x2_t v21 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v20)), 15); + int16x4_t v26 = vld1s_s16(&v5[istride * 9]); + int16x4_t v34 = vld1s_s16(&v5[istride * 2]); + int16x4_t v40 = vld1s_s16(&v5[istride * 11]); + int16x4_t v48 = vld1s_s16(&v5[istride * 4]); + int16x4_t v54 = vld1s_s16(&v5[istride * 13]); + int16x4_t v62 = vld1s_s16(&v5[istride * 6]); + int16x4_t v68 = vld1s_s16(&v5[istride * 15]); + int16x4_t v76 = vld1s_s16(&v5[istride * 8]); + int16x4_t v82 = vld1s_s16(&v5[istride * 17]); + int16x4_t v90 = vld1s_s16(&v5[istride * 10]); + int16x4_t v104 = vld1s_s16(&v5[istride * 12]); + int16x4_t v110 = vld1s_s16(&v5[istride * 3]); + int16x4_t v118 = vld1s_s16(&v5[istride * 14]); + int16x4_t v124 = vld1s_s16(&v5[istride * 5]); + int16x4_t v132 = vld1s_s16(&v5[istride * 16]); + int16x4_t v138 = vld1s_s16(&v5[istride * 7]); + float32x2_t v289 = vmul_f32(v321, v287); + float32x2_t v308 = vmul_f32(v321, v306); + float32x2_t v315 = vmul_f32(v321, v313); + float32x2_t v322 = vmul_f32(v321, v320); + float32x2_t v27 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v26)), 15); + float32x2_t v35 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v34)), 15); + float32x2_t v41 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v40)), 15); + float32x2_t v49 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v48)), 15); + float32x2_t v55 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v54)), 15); + float32x2_t v63 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v62)), 15); + float32x2_t v69 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v68)), 15); + float32x2_t v77 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v76)), 15); + float32x2_t v83 = 
vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v82)), 15); + float32x2_t v91 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v90)), 15); + float32x2_t v105 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v104)), 15); + float32x2_t v111 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v110)), 15); + float32x2_t v119 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v118)), 15); + float32x2_t v125 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v124)), 15); + float32x2_t v133 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v132)), 15); + float32x2_t v139 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v138)), 15); + float32x2_t v28 = vadd_f32(v21, v27); + float32x2_t v29 = vsub_f32(v21, v27); + float32x2_t v42 = vadd_f32(v35, v41); + float32x2_t v43 = vsub_f32(v35, v41); + float32x2_t v56 = vadd_f32(v49, v55); + float32x2_t v57 = vsub_f32(v49, v55); + float32x2_t v70 = vadd_f32(v63, v69); + float32x2_t v71 = vsub_f32(v63, v69); + float32x2_t v84 = vadd_f32(v77, v83); + float32x2_t v85 = vsub_f32(v77, v83); + float32x2_t v98 = vadd_f32(v91, v97); + float32x2_t v99 = vsub_f32(v91, v97); + float32x2_t v112 = vadd_f32(v105, v111); + float32x2_t v113 = vsub_f32(v105, v111); + float32x2_t v126 = vadd_f32(v119, v125); + float32x2_t v127 = vsub_f32(v119, v125); + float32x2_t v140 = vadd_f32(v133, v139); + float32x2_t v141 = vsub_f32(v133, v139); + float32x2_t v142 = vadd_f32(v42, v140); + float32x2_t v143 = vsub_f32(v42, v140); + float32x2_t v144 = vadd_f32(v126, v56); + float32x2_t v145 = vsub_f32(v126, v56); + float32x2_t v146 = vadd_f32(v70, v112); + float32x2_t v147 = vsub_f32(v70, v112); + float32x2_t v148 = vadd_f32(v84, v98); + float32x2_t v149 = vsub_f32(v84, v98); + float32x2_t v246 = vadd_f32(v43, v141); + float32x2_t v247 = vsub_f32(v43, v141); + float32x2_t v248 = vadd_f32(v127, v57); + float32x2_t v249 = vsub_f32(v127, v57); + float32x2_t v250 = vadd_f32(v71, v113); + float32x2_t v251 = vsub_f32(v71, v113); + float32x2_t v252 = vadd_f32(v85, v99); + float32x2_t v253 = vsub_f32(v85, v99); + float32x2_t v150 = vadd_f32(v142, v144); + float32x2_t v154 = vadd_f32(v143, v145); + float32x2_t v156 = vsub_f32(v142, v144); + float32x2_t v157 = vsub_f32(v144, v148); + float32x2_t v158 = vsub_f32(v148, v142); + float32x2_t v159 = vsub_f32(v143, v145); + float32x2_t v160 = vsub_f32(v145, v149); + float32x2_t v161 = vsub_f32(v149, v143); + float32x2_t v180 = vmul_f32(v146, v283); + float32x2_t v186 = vrev64_f32(v147); + float32x2_t v254 = vadd_f32(v246, v248); + float32x2_t v258 = vadd_f32(v247, v249); + float32x2_t v260 = vsub_f32(v246, v248); + float32x2_t v261 = vsub_f32(v248, v252); + float32x2_t v262 = vsub_f32(v252, v246); + float32x2_t v263 = vsub_f32(v247, v249); + float32x2_t v264 = vsub_f32(v249, v253); + float32x2_t v265 = vsub_f32(v253, v247); + float32x2_t v284 = vmul_f32(v250, v283); + float32x2_t v290 = vrev64_f32(v251); + float32x2_t v151 = vadd_f32(v150, v148); + float32x2_t v155 = vadd_f32(v154, v149); + float32x2_t v187 = vmul_f32(v186, v289); + float32x2_t v191 = vmul_f32(v156, v294); + float32x2_t v195 = vmul_f32(v157, v298); + float32x2_t v199 = vmul_f32(v158, v302); + float32x2_t v205 = vrev64_f32(v159); + float32x2_t v212 = vrev64_f32(v160); + float32x2_t v219 = vrev64_f32(v161); + float32x2_t v255 = vadd_f32(v254, v252); + float32x2_t v259 = vadd_f32(v258, v253); + float32x2_t v291 = vmul_f32(v290, v289); + float32x2_t v295 = vmul_f32(v260, v294); + float32x2_t v299 = vmul_f32(v261, v298); + float32x2_t v303 = vmul_f32(v262, v302); + float32x2_t v309 = vrev64_f32(v263); + float32x2_t v316 = vrev64_f32(v264); + float32x2_t v323 = 
vrev64_f32(v265); + float32x2_t v152 = vadd_f32(v151, v146); + float32x2_t v169 = vmul_f32(v151, v272); + float32x2_t v175 = vrev64_f32(v155); + float32x2_t v206 = vmul_f32(v205, v308); + float32x2_t v213 = vmul_f32(v212, v315); + float32x2_t v220 = vmul_f32(v219, v322); + float32x2_t v256 = vadd_f32(v255, v250); + float32x2_t v273 = vmul_f32(v255, v272); + float32x2_t v279 = vrev64_f32(v259); + float32x2_t v310 = vmul_f32(v309, v308); + float32x2_t v317 = vmul_f32(v316, v315); + float32x2_t v324 = vmul_f32(v323, v322); + float32x2_t v153 = vadd_f32(v152, v28); + float32x2_t v176 = vmul_f32(v175, v289); + float32x2_t v221 = vadd_f32(v169, v169); + float32x2_t v234 = vadd_f32(v187, v206); + float32x2_t v236 = vsub_f32(v187, v213); + float32x2_t v238 = vsub_f32(v187, v206); + float32x2_t v257 = vadd_f32(v256, v29); + float32x2_t v280 = vmul_f32(v279, v289); + float32x2_t v325 = vadd_f32(v273, v273); + float32x2_t v338 = vadd_f32(v291, v310); + float32x2_t v340 = vsub_f32(v291, v317); + float32x2_t v342 = vsub_f32(v291, v310); + float32x2_t v222 = vadd_f32(v221, v169); + float32x2_t v226 = vadd_f32(v153, v180); + float32x2_t v235 = vadd_f32(v234, v213); + float32x2_t v237 = vadd_f32(v236, v220); + float32x2_t v239 = vsub_f32(v238, v220); + float32x2_t v326 = vadd_f32(v325, v273); + float32x2_t v330 = vadd_f32(v257, v284); + float32x2_t v339 = vadd_f32(v338, v317); + float32x2_t v341 = vadd_f32(v340, v324); + float32x2_t v343 = vsub_f32(v342, v324); + int16x4_t v352 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v153, 15), (int32x2_t){0, 0})); + int16x4_t v358 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v257, 15), (int32x2_t){0, 0})); + float32x2_t v223 = vadd_f32(v153, v222); + float32x2_t v227 = vadd_f32(v226, v221); + float32x2_t v327 = vadd_f32(v257, v326); + float32x2_t v331 = vadd_f32(v330, v325); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v352), 0); + v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v358), 0); + float32x2_t v224 = vadd_f32(v223, v176); + float32x2_t v225 = vsub_f32(v223, v176); + float32x2_t v228 = vadd_f32(v227, v191); + float32x2_t v230 = vsub_f32(v227, v195); + float32x2_t v232 = vsub_f32(v227, v191); + float32x2_t v328 = vadd_f32(v327, v280); + float32x2_t v329 = vsub_f32(v327, v280); + float32x2_t v332 = vadd_f32(v331, v295); + float32x2_t v334 = vsub_f32(v331, v299); + float32x2_t v336 = vsub_f32(v331, v295); + float32x2_t v229 = vadd_f32(v228, v195); + float32x2_t v231 = vadd_f32(v230, v199); + float32x2_t v233 = vsub_f32(v232, v199); + float32x2_t v333 = vadd_f32(v332, v299); + float32x2_t v335 = vadd_f32(v334, v303); + float32x2_t v337 = vsub_f32(v336, v303); + int16x4_t v388 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v225, 15), (int32x2_t){0, 0})); + int16x4_t v394 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v329, 15), (int32x2_t){0, 0})); + int16x4_t v424 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v224, 15), (int32x2_t){0, 0})); + int16x4_t v430 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v328, 15), (int32x2_t){0, 0})); + float32x2_t v240 = vadd_f32(v229, v235); + float32x2_t v241 = vsub_f32(v229, v235); + float32x2_t v242 = vadd_f32(v231, v237); + float32x2_t v243 = vsub_f32(v231, v237); + float32x2_t v244 = vadd_f32(v233, v239); + float32x2_t v245 = vsub_f32(v233, v239); + float32x2_t v344 = vadd_f32(v333, v339); + float32x2_t v345 = vsub_f32(v333, v339); + float32x2_t v346 = vadd_f32(v335, v341); + float32x2_t v347 = vsub_f32(v335, v341); + float32x2_t v348 = vadd_f32(v337, v343); + float32x2_t v349 = vsub_f32(v337, v343); + v6[ostride * 12] = 
vget_lane_s32(vreinterpret_s32_s16(v388), 0); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v394), 0); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v424), 0); + v6[ostride * 15] = vget_lane_s32(vreinterpret_s32_s16(v430), 0); + int16x4_t v364 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v241, 15), (int32x2_t){0, 0})); + int16x4_t v370 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v345, 15), (int32x2_t){0, 0})); + int16x4_t v376 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v242, 15), (int32x2_t){0, 0})); + int16x4_t v382 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v346, 15), (int32x2_t){0, 0})); + int16x4_t v400 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v245, 15), (int32x2_t){0, 0})); + int16x4_t v406 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v349, 15), (int32x2_t){0, 0})); + int16x4_t v412 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v244, 15), (int32x2_t){0, 0})); + int16x4_t v418 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v348, 15), (int32x2_t){0, 0})); + int16x4_t v436 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v243, 15), (int32x2_t){0, 0})); + int16x4_t v442 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v347, 15), (int32x2_t){0, 0})); + int16x4_t v448 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v240, 15), (int32x2_t){0, 0})); + int16x4_t v454 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v344, 15), (int32x2_t){0, 0})); + v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v364), 0); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v370), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v376), 0); + v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v382), 0); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v400), 0); + v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v406), 0); + v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v412), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v418), 0); + v6[ostride * 16] = vget_lane_s32(vreinterpret_s32_s16(v436), 0); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v442), 0); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v448), 0); + v6[ostride * 17] = vget_lane_s32(vreinterpret_s32_s16(v454), 0); + v5 += 1 * 1; + v6 += 1 * 1; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uu18(const armral_cmplx_int16_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * 1; + int64_t v12 = v10 * 1; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + float v316 = -5.0000000000000000e-01F; + float v328 = -1.4999999999999998e+00F; + float v333 = -8.6602540378443871e-01F; + float v340 = 7.6604444311897801e-01F; + float v345 = 9.3969262078590832e-01F; + float v350 = -1.7364817766693039e-01F; + float v355 = -6.4278760968653925e-01F; + float v362 = 3.4202014332566888e-01F; + float v369 = -9.8480775301220802e-01F; + const int32_t *v650 = &v5[v0]; + int32_t *v763 = &v6[v2]; + int64_t v27 = v0 * 9; + int64_t v37 = v0 * 2; + int64_t v45 = v0 * 11; + int64_t v55 = v0 * 4; + int64_t v63 = v0 * 13; + int64_t v73 = v0 * 6; + int64_t v81 = v0 * 15; + int64_t v91 = v0 * 8; + int64_t v99 = v0 * 17; + int64_t v109 = v0 * 10; + int64_t v127 = v0 * 12; + int64_t v135 = v0 * 3; + int64_t v145 = v0 * 14; + int64_t v153 = v0 * 5; + int64_t v163 = v0 * 16; + int64_t v171 = v0 * 7; + float 
v336 = v4 * v333; + float v358 = v4 * v355; + float v365 = v4 * v362; + float v372 = v4 * v369; + int64_t v409 = v2 * 9; + int64_t v417 = v2 * 10; + int64_t v433 = v2 * 2; + int64_t v441 = v2 * 11; + int64_t v449 = v2 * 12; + int64_t v457 = v2 * 3; + int64_t v465 = v2 * 4; + int64_t v473 = v2 * 13; + int64_t v481 = v2 * 14; + int64_t v489 = v2 * 5; + int64_t v497 = v2 * 6; + int64_t v505 = v2 * 15; + int64_t v513 = v2 * 16; + int64_t v521 = v2 * 7; + int64_t v529 = v2 * 8; + int64_t v537 = v2 * 17; + const int32_t *v551 = &v5[0]; + svfloat32_t v719 = svdup_n_f32(v316); + svfloat32_t v721 = svdup_n_f32(v328); + svfloat32_t v723 = svdup_n_f32(v340); + svfloat32_t v724 = svdup_n_f32(v345); + svfloat32_t v725 = svdup_n_f32(v350); + int32_t *v736 = &v6[0]; + svfloat32_t v123 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v650[0])), + 1.F / (1ULL << 15ULL)); + const int32_t *v560 = &v5[v27]; + const int32_t *v569 = &v5[v37]; + const int32_t *v578 = &v5[v45]; + const int32_t *v587 = &v5[v55]; + const int32_t *v596 = &v5[v63]; + const int32_t *v605 = &v5[v73]; + const int32_t *v614 = &v5[v81]; + const int32_t *v623 = &v5[v91]; + const int32_t *v632 = &v5[v99]; + const int32_t *v641 = &v5[v109]; + const int32_t *v659 = &v5[v127]; + const int32_t *v668 = &v5[v135]; + const int32_t *v677 = &v5[v145]; + const int32_t *v686 = &v5[v153]; + const int32_t *v695 = &v5[v163]; + const int32_t *v704 = &v5[v171]; + svfloat32_t v722 = svdup_n_f32(v336); + svfloat32_t v726 = svdup_n_f32(v358); + svfloat32_t v727 = svdup_n_f32(v365); + svfloat32_t v728 = svdup_n_f32(v372); + int32_t *v745 = &v6[v409]; + int32_t *v754 = &v6[v417]; + int32_t *v772 = &v6[v433]; + int32_t *v781 = &v6[v441]; + int32_t *v790 = &v6[v449]; + int32_t *v799 = &v6[v457]; + int32_t *v808 = &v6[v465]; + int32_t *v817 = &v6[v473]; + int32_t *v826 = &v6[v481]; + int32_t *v835 = &v6[v489]; + int32_t *v844 = &v6[v497]; + int32_t *v853 = &v6[v505]; + int32_t *v862 = &v6[v513]; + int32_t *v871 = &v6[v521]; + int32_t *v880 = &v6[v529]; + int32_t *v889 = &v6[v537]; + svfloat32_t v25 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v551[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v33 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v560[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v43 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v569[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v51 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v578[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v61 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v587[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v69 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v596[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v79 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v605[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v87 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v614[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v97 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v623[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v105 = svmul_n_f32_x( + 
pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v632[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v115 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v641[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v133 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v659[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v141 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v668[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v151 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v677[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v159 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v686[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v169 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v695[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v177 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v704[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v34 = svadd_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v35 = svsub_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v52 = svadd_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v53 = svsub_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v70 = svadd_f32_x(svptrue_b32(), v61, v69); + svfloat32_t v71 = svsub_f32_x(svptrue_b32(), v61, v69); + svfloat32_t v88 = svadd_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v89 = svsub_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v106 = svadd_f32_x(svptrue_b32(), v97, v105); + svfloat32_t v107 = svsub_f32_x(svptrue_b32(), v97, v105); + svfloat32_t v124 = svadd_f32_x(svptrue_b32(), v115, v123); + svfloat32_t v125 = svsub_f32_x(svptrue_b32(), v115, v123); + svfloat32_t v142 = svadd_f32_x(svptrue_b32(), v133, v141); + svfloat32_t v143 = svsub_f32_x(svptrue_b32(), v133, v141); + svfloat32_t v160 = svadd_f32_x(svptrue_b32(), v151, v159); + svfloat32_t v161 = svsub_f32_x(svptrue_b32(), v151, v159); + svfloat32_t v178 = svadd_f32_x(svptrue_b32(), v169, v177); + svfloat32_t v179 = svsub_f32_x(svptrue_b32(), v169, v177); + svfloat32_t v180 = svadd_f32_x(svptrue_b32(), v52, v178); + svfloat32_t v181 = svsub_f32_x(svptrue_b32(), v52, v178); + svfloat32_t v182 = svadd_f32_x(svptrue_b32(), v160, v70); + svfloat32_t v183 = svsub_f32_x(svptrue_b32(), v160, v70); + svfloat32_t v184 = svadd_f32_x(svptrue_b32(), v88, v142); + svfloat32_t v185 = svsub_f32_x(svptrue_b32(), v88, v142); + svfloat32_t v186 = svadd_f32_x(svptrue_b32(), v106, v124); + svfloat32_t v187 = svsub_f32_x(svptrue_b32(), v106, v124); + svfloat32_t v290 = svadd_f32_x(svptrue_b32(), v53, v179); + svfloat32_t v291 = svsub_f32_x(svptrue_b32(), v53, v179); + svfloat32_t v292 = svadd_f32_x(svptrue_b32(), v161, v71); + svfloat32_t v293 = svsub_f32_x(svptrue_b32(), v161, v71); + svfloat32_t v294 = svadd_f32_x(svptrue_b32(), v89, v143); + svfloat32_t v295 = svsub_f32_x(svptrue_b32(), v89, v143); + svfloat32_t v296 = svadd_f32_x(svptrue_b32(), v107, v125); + svfloat32_t v297 = svsub_f32_x(svptrue_b32(), v107, v125); + svfloat32_t v188 = svadd_f32_x(svptrue_b32(), v180, v182); + svfloat32_t v192 = svadd_f32_x(svptrue_b32(), v181, v183); + svfloat32_t v194 = svsub_f32_x(svptrue_b32(), v180, v182); + svfloat32_t v195 = svsub_f32_x(svptrue_b32(), v182, v186); + svfloat32_t v196 = svsub_f32_x(svptrue_b32(), 
v186, v180); + svfloat32_t v197 = svsub_f32_x(svptrue_b32(), v181, v183); + svfloat32_t v198 = svsub_f32_x(svptrue_b32(), v183, v187); + svfloat32_t v199 = svsub_f32_x(svptrue_b32(), v187, v181); + svfloat32_t zero228 = svdup_n_f32(0); + svfloat32_t v228 = svcmla_f32_x(pred_full, zero228, v722, v185, 90); + svfloat32_t v298 = svadd_f32_x(svptrue_b32(), v290, v292); + svfloat32_t v302 = svadd_f32_x(svptrue_b32(), v291, v293); + svfloat32_t v304 = svsub_f32_x(svptrue_b32(), v290, v292); + svfloat32_t v305 = svsub_f32_x(svptrue_b32(), v292, v296); + svfloat32_t v306 = svsub_f32_x(svptrue_b32(), v296, v290); + svfloat32_t v307 = svsub_f32_x(svptrue_b32(), v291, v293); + svfloat32_t v308 = svsub_f32_x(svptrue_b32(), v293, v297); + svfloat32_t v309 = svsub_f32_x(svptrue_b32(), v297, v291); + svfloat32_t zero338 = svdup_n_f32(0); + svfloat32_t v338 = svcmla_f32_x(pred_full, zero338, v722, v295, 90); + svfloat32_t v189 = svadd_f32_x(svptrue_b32(), v188, v186); + svfloat32_t v193 = svadd_f32_x(svptrue_b32(), v192, v187); + svfloat32_t zero250 = svdup_n_f32(0); + svfloat32_t v250 = svcmla_f32_x(pred_full, zero250, v726, v197, 90); + svfloat32_t zero257 = svdup_n_f32(0); + svfloat32_t v257 = svcmla_f32_x(pred_full, zero257, v727, v198, 90); + svfloat32_t zero264 = svdup_n_f32(0); + svfloat32_t v264 = svcmla_f32_x(pred_full, zero264, v728, v199, 90); + svfloat32_t v299 = svadd_f32_x(svptrue_b32(), v298, v296); + svfloat32_t v303 = svadd_f32_x(svptrue_b32(), v302, v297); + svfloat32_t zero360 = svdup_n_f32(0); + svfloat32_t v360 = svcmla_f32_x(pred_full, zero360, v726, v307, 90); + svfloat32_t zero367 = svdup_n_f32(0); + svfloat32_t v367 = svcmla_f32_x(pred_full, zero367, v727, v308, 90); + svfloat32_t zero374 = svdup_n_f32(0); + svfloat32_t v374 = svcmla_f32_x(pred_full, zero374, v728, v309, 90); + svfloat32_t v190 = svadd_f32_x(svptrue_b32(), v189, v184); + svfloat32_t v209 = svmul_f32_x(svptrue_b32(), v189, v719); + svfloat32_t zero216 = svdup_n_f32(0); + svfloat32_t v216 = svcmla_f32_x(pred_full, zero216, v722, v193, 90); + svfloat32_t v278 = svadd_f32_x(svptrue_b32(), v228, v250); + svfloat32_t v280 = svsub_f32_x(svptrue_b32(), v228, v257); + svfloat32_t v282 = svsub_f32_x(svptrue_b32(), v228, v250); + svfloat32_t v300 = svadd_f32_x(svptrue_b32(), v299, v294); + svfloat32_t v319 = svmul_f32_x(svptrue_b32(), v299, v719); + svfloat32_t zero326 = svdup_n_f32(0); + svfloat32_t v326 = svcmla_f32_x(pred_full, zero326, v722, v303, 90); + svfloat32_t v388 = svadd_f32_x(svptrue_b32(), v338, v360); + svfloat32_t v390 = svsub_f32_x(svptrue_b32(), v338, v367); + svfloat32_t v392 = svsub_f32_x(svptrue_b32(), v338, v360); + svfloat32_t v191 = svadd_f32_x(svptrue_b32(), v190, v34); + svfloat32_t v265 = svadd_f32_x(svptrue_b32(), v209, v209); + svfloat32_t v279 = svadd_f32_x(svptrue_b32(), v278, v257); + svfloat32_t v281 = svadd_f32_x(svptrue_b32(), v280, v264); + svfloat32_t v283 = svsub_f32_x(svptrue_b32(), v282, v264); + svfloat32_t v301 = svadd_f32_x(svptrue_b32(), v300, v35); + svfloat32_t v375 = svadd_f32_x(svptrue_b32(), v319, v319); + svfloat32_t v389 = svadd_f32_x(svptrue_b32(), v388, v367); + svfloat32_t v391 = svadd_f32_x(svptrue_b32(), v390, v374); + svfloat32_t v393 = svsub_f32_x(svptrue_b32(), v392, v374); + svfloat32_t v266 = svmla_f32_x(pred_full, v265, v189, v719); + svfloat32_t v270 = svmla_f32_x(pred_full, v191, v184, v721); + svfloat32_t v376 = svmla_f32_x(pred_full, v375, v299, v719); + svfloat32_t v380 = svmla_f32_x(pred_full, v301, v294, v721); + svint16_t v402 = svtbl_s16( + 
svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v191, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v410 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v301, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v267 = svadd_f32_x(svptrue_b32(), v191, v266); + svfloat32_t v271 = svadd_f32_x(svptrue_b32(), v270, v265); + svfloat32_t v377 = svadd_f32_x(svptrue_b32(), v301, v376); + svfloat32_t v381 = svadd_f32_x(svptrue_b32(), v380, v375); + svst1w_u64(pred_full, (unsigned *)(v736), svreinterpret_u64_s16(v402)); + svst1w_u64(pred_full, (unsigned *)(v745), svreinterpret_u64_s16(v410)); + svfloat32_t v268 = svadd_f32_x(svptrue_b32(), v267, v216); + svfloat32_t v269 = svsub_f32_x(svptrue_b32(), v267, v216); + svfloat32_t v272 = svmla_f32_x(pred_full, v271, v194, v723); + svfloat32_t v274 = svmls_f32_x(pred_full, v271, v195, v724); + svfloat32_t v276 = svmls_f32_x(pred_full, v271, v194, v723); + svfloat32_t v378 = svadd_f32_x(svptrue_b32(), v377, v326); + svfloat32_t v379 = svsub_f32_x(svptrue_b32(), v377, v326); + svfloat32_t v382 = svmla_f32_x(pred_full, v381, v304, v723); + svfloat32_t v384 = svmls_f32_x(pred_full, v381, v305, v724); + svfloat32_t v386 = svmls_f32_x(pred_full, v381, v304, v723); + svfloat32_t v273 = svmla_f32_x(pred_full, v272, v195, v724); + svfloat32_t v275 = svmla_f32_x(pred_full, v274, v196, v725); + svfloat32_t v277 = svmls_f32_x(pred_full, v276, v196, v725); + svfloat32_t v383 = svmla_f32_x(pred_full, v382, v305, v724); + svfloat32_t v385 = svmla_f32_x(pred_full, v384, v306, v725); + svfloat32_t v387 = svmls_f32_x(pred_full, v386, v306, v725); + svint16_t v450 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v269, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v458 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v379, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v498 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v268, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v506 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v378, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v284 = svadd_f32_x(svptrue_b32(), v273, v279); + svfloat32_t v285 = svsub_f32_x(svptrue_b32(), v273, v279); + svfloat32_t v286 = svadd_f32_x(svptrue_b32(), v275, v281); + svfloat32_t v287 = svsub_f32_x(svptrue_b32(), v275, v281); + svfloat32_t v288 = svadd_f32_x(svptrue_b32(), v277, v283); + svfloat32_t v289 = svsub_f32_x(svptrue_b32(), v277, v283); + svfloat32_t v394 = svadd_f32_x(svptrue_b32(), v383, v389); + svfloat32_t v395 = svsub_f32_x(svptrue_b32(), v383, v389); + svfloat32_t v396 = svadd_f32_x(svptrue_b32(), v385, v391); + svfloat32_t v397 = svsub_f32_x(svptrue_b32(), v385, v391); + svfloat32_t v398 = svadd_f32_x(svptrue_b32(), v387, v393); + svfloat32_t v399 = svsub_f32_x(svptrue_b32(), v387, v393); + svst1w_u64(pred_full, (unsigned *)(v790), svreinterpret_u64_s16(v450)); + 
svst1w_u64(pred_full, (unsigned *)(v799), svreinterpret_u64_s16(v458)); + svst1w_u64(pred_full, (unsigned *)(v844), svreinterpret_u64_s16(v498)); + svst1w_u64(pred_full, (unsigned *)(v853), svreinterpret_u64_s16(v506)); + svint16_t v418 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v285, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v426 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v395, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v434 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v286, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v442 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v396, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v466 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v289, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v474 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v399, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v482 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v288, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v490 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v398, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v514 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v287, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v522 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v397, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v530 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v284, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v538 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v394, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v754), svreinterpret_u64_s16(v418)); + svst1w_u64(pred_full, (unsigned *)(v763), svreinterpret_u64_s16(v426)); + svst1w_u64(pred_full, (unsigned *)(v772), svreinterpret_u64_s16(v434)); + svst1w_u64(pred_full, (unsigned *)(v781), svreinterpret_u64_s16(v442)); + svst1w_u64(pred_full, (unsigned *)(v808), svreinterpret_u64_s16(v466)); + svst1w_u64(pred_full, (unsigned *)(v817), svreinterpret_u64_s16(v474)); + svst1w_u64(pred_full, (unsigned *)(v826), svreinterpret_u64_s16(v482)); + svst1w_u64(pred_full, (unsigned *)(v835), 
svreinterpret_u64_s16(v490)); + svst1w_u64(pred_full, (unsigned *)(v862), svreinterpret_u64_s16(v514)); + svst1w_u64(pred_full, (unsigned *)(v871), svreinterpret_u64_s16(v522)); + svst1w_u64(pred_full, (unsigned *)(v880), svreinterpret_u64_s16(v530)); + svst1w_u64(pred_full, (unsigned *)(v889), svreinterpret_u64_s16(v538)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uu19(const armral_cmplx_int16_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + for (int j = 0; j < howmany; j += 1) { + int16x4_t v20 = vld1s_s16(&v5[istride]); + float v218 = -1.0555555555555556e+00F; + float v222 = 1.7752228513927079e-01F; + float v226 = -1.2820077502191529e-01F; + float v230 = 4.9321510117355499e-02F; + float v234 = 5.7611011491005903e-01F; + float v238 = -7.4996449655536279e-01F; + float v242 = -1.7385438164530381e-01F; + float v246 = -2.1729997561977314e+00F; + float v250 = -1.7021211726914738e+00F; + float v254 = 4.7087858350625778e-01F; + float v258 = -2.0239400846888440e+00F; + float v262 = 1.0551641201664090e-01F; + float v266 = 2.1294564967054850e+00F; + float v270 = -7.5087543897371167e-01F; + float v274 = 1.4812817695157160e-01F; + float v278 = 8.9900361592528333e-01F; + float v282 = -6.2148246772602778e-01F; + float v286 = -7.9869352098712687e-01F; + float v290 = -4.7339199623771833e-01F; + float v293 = -2.4216105241892630e-01F; + float v294 = 2.4216105241892630e-01F; + float v300 = -5.9368607967505101e-02F; + float v301 = 5.9368607967505101e-02F; + float v307 = 1.2578688255176201e-02F; + float v308 = -1.2578688255176201e-02F; + float v314 = -4.6789919712328903e-02F; + float v315 = 4.6789919712328903e-02F; + float v321 = -9.3750121913782358e-01F; + float v322 = 9.3750121913782358e-01F; + float v328 = -5.0111537043352902e-02F; + float v329 = 5.0111537043352902e-02F; + float v335 = -9.8761275618117661e-01F; + float v336 = 9.8761275618117661e-01F; + float v342 = -1.1745786501205959e+00F; + float v343 = 1.1745786501205959e+00F; + float v349 = 1.1114482296234993e+00F; + float v350 = -1.1114482296234993e+00F; + float v356 = 2.2860268797440955e+00F; + float v357 = -2.2860268797440955e+00F; + float v363 = 2.6420523257930939e-01F; + float v364 = -2.6420523257930939e-01F; + float v370 = 2.1981792779352136e+00F; + float v371 = -2.1981792779352136e+00F; + float v377 = 1.9339740453559042e+00F; + float v378 = -1.9339740453559042e+00F; + float v384 = -7.4825847091254893e-01F; + float v385 = 7.4825847091254893e-01F; + float v391 = -4.7820835642768872e-01F; + float v392 = 4.7820835642768872e-01F; + float v398 = 2.7005011448486022e-01F; + float v399 = -2.7005011448486022e-01F; + float v405 = -3.4642356159542270e-01F; + float v406 = 3.4642356159542270e-01F; + float v412 = -8.3485429360688279e-01F; + float v413 = 8.3485429360688279e-01F; + float v419 = -3.9375928506743518e-01F; + float v420 = 3.9375928506743518e-01F; + float32x2_t v422 = (float32x2_t){v4, v4}; + float32x2_t v21 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v20)), 15); + int16x4_t v162 = vld1s_s16(&v5[0]); + float32x2_t v219 = (float32x2_t){v218, v218}; + float32x2_t v223 = (float32x2_t){v222, v222}; + float32x2_t v227 = (float32x2_t){v226, v226}; + float32x2_t v231 = (float32x2_t){v230, v230}; + float32x2_t v235 = (float32x2_t){v234, v234}; + float32x2_t v239 = (float32x2_t){v238, v238}; + float32x2_t v243 = (float32x2_t){v242, v242}; + float32x2_t 
v247 = (float32x2_t){v246, v246}; + float32x2_t v251 = (float32x2_t){v250, v250}; + float32x2_t v255 = (float32x2_t){v254, v254}; + float32x2_t v259 = (float32x2_t){v258, v258}; + float32x2_t v263 = (float32x2_t){v262, v262}; + float32x2_t v267 = (float32x2_t){v266, v266}; + float32x2_t v271 = (float32x2_t){v270, v270}; + float32x2_t v275 = (float32x2_t){v274, v274}; + float32x2_t v279 = (float32x2_t){v278, v278}; + float32x2_t v283 = (float32x2_t){v282, v282}; + float32x2_t v287 = (float32x2_t){v286, v286}; + float32x2_t v291 = (float32x2_t){v290, v290}; + float32x2_t v295 = (float32x2_t){v293, v294}; + float32x2_t v302 = (float32x2_t){v300, v301}; + float32x2_t v309 = (float32x2_t){v307, v308}; + float32x2_t v316 = (float32x2_t){v314, v315}; + float32x2_t v323 = (float32x2_t){v321, v322}; + float32x2_t v330 = (float32x2_t){v328, v329}; + float32x2_t v337 = (float32x2_t){v335, v336}; + float32x2_t v344 = (float32x2_t){v342, v343}; + float32x2_t v351 = (float32x2_t){v349, v350}; + float32x2_t v358 = (float32x2_t){v356, v357}; + float32x2_t v365 = (float32x2_t){v363, v364}; + float32x2_t v372 = (float32x2_t){v370, v371}; + float32x2_t v379 = (float32x2_t){v377, v378}; + float32x2_t v386 = (float32x2_t){v384, v385}; + float32x2_t v393 = (float32x2_t){v391, v392}; + float32x2_t v400 = (float32x2_t){v398, v399}; + float32x2_t v407 = (float32x2_t){v405, v406}; + float32x2_t v414 = (float32x2_t){v412, v413}; + float32x2_t v421 = (float32x2_t){v419, v420}; + int16x4_t v26 = vld1s_s16(&v5[istride * 18]); + int16x4_t v34 = vld1s_s16(&v5[istride * 2]); + int16x4_t v40 = vld1s_s16(&v5[istride * 17]); + int16x4_t v48 = vld1s_s16(&v5[istride * 4]); + int16x4_t v54 = vld1s_s16(&v5[istride * 15]); + int16x4_t v62 = vld1s_s16(&v5[istride * 8]); + int16x4_t v68 = vld1s_s16(&v5[istride * 11]); + int16x4_t v76 = vld1s_s16(&v5[istride * 16]); + int16x4_t v82 = vld1s_s16(&v5[istride * 3]); + int16x4_t v90 = vld1s_s16(&v5[istride * 13]); + int16x4_t v96 = vld1s_s16(&v5[istride * 6]); + int16x4_t v104 = vld1s_s16(&v5[istride * 7]); + int16x4_t v110 = vld1s_s16(&v5[istride * 12]); + int16x4_t v118 = vld1s_s16(&v5[istride * 14]); + int16x4_t v124 = vld1s_s16(&v5[istride * 5]); + int16x4_t v132 = vld1s_s16(&v5[istride * 9]); + int16x4_t v138 = vld1s_s16(&v5[istride * 10]); + float32x2_t v163 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v162)), 15); + float32x2_t v297 = vmul_f32(v422, v295); + float32x2_t v304 = vmul_f32(v422, v302); + float32x2_t v311 = vmul_f32(v422, v309); + float32x2_t v318 = vmul_f32(v422, v316); + float32x2_t v325 = vmul_f32(v422, v323); + float32x2_t v332 = vmul_f32(v422, v330); + float32x2_t v339 = vmul_f32(v422, v337); + float32x2_t v346 = vmul_f32(v422, v344); + float32x2_t v353 = vmul_f32(v422, v351); + float32x2_t v360 = vmul_f32(v422, v358); + float32x2_t v367 = vmul_f32(v422, v365); + float32x2_t v374 = vmul_f32(v422, v372); + float32x2_t v381 = vmul_f32(v422, v379); + float32x2_t v388 = vmul_f32(v422, v386); + float32x2_t v395 = vmul_f32(v422, v393); + float32x2_t v402 = vmul_f32(v422, v400); + float32x2_t v409 = vmul_f32(v422, v407); + float32x2_t v416 = vmul_f32(v422, v414); + float32x2_t v423 = vmul_f32(v422, v421); + float32x2_t v27 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v26)), 15); + float32x2_t v35 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v34)), 15); + float32x2_t v41 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v40)), 15); + float32x2_t v49 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v48)), 15); + float32x2_t v55 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v54)), 15); + float32x2_t v63 = 
vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v62)), 15); + float32x2_t v69 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v68)), 15); + float32x2_t v77 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v76)), 15); + float32x2_t v83 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v82)), 15); + float32x2_t v91 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v90)), 15); + float32x2_t v97 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v96)), 15); + float32x2_t v105 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v104)), 15); + float32x2_t v111 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v110)), 15); + float32x2_t v119 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v118)), 15); + float32x2_t v125 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v124)), 15); + float32x2_t v133 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v132)), 15); + float32x2_t v139 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v138)), 15); + float32x2_t v28 = vadd_f32(v21, v27); + float32x2_t v29 = vsub_f32(v21, v27); + float32x2_t v42 = vadd_f32(v35, v41); + float32x2_t v43 = vsub_f32(v41, v35); + float32x2_t v56 = vadd_f32(v49, v55); + float32x2_t v57 = vsub_f32(v49, v55); + float32x2_t v70 = vadd_f32(v63, v69); + float32x2_t v71 = vsub_f32(v69, v63); + float32x2_t v84 = vadd_f32(v77, v83); + float32x2_t v85 = vsub_f32(v77, v83); + float32x2_t v98 = vadd_f32(v91, v97); + float32x2_t v99 = vsub_f32(v97, v91); + float32x2_t v112 = vadd_f32(v105, v111); + float32x2_t v113 = vsub_f32(v105, v111); + float32x2_t v126 = vadd_f32(v119, v125); + float32x2_t v127 = vsub_f32(v125, v119); + float32x2_t v140 = vadd_f32(v133, v139); + float32x2_t v141 = vsub_f32(v133, v139); + float32x2_t v142 = vsub_f32(v28, v112); + float32x2_t v143 = vsub_f32(v42, v126); + float32x2_t v144 = vsub_f32(v56, v140); + float32x2_t v145 = vsub_f32(v70, v112); + float32x2_t v146 = vsub_f32(v84, v126); + float32x2_t v147 = vsub_f32(v98, v140); + float32x2_t v148 = vadd_f32(v28, v70); + float32x2_t v150 = vadd_f32(v42, v84); + float32x2_t v152 = vadd_f32(v56, v98); + float32x2_t v181 = vsub_f32(v29, v113); + float32x2_t v182 = vsub_f32(v43, v127); + float32x2_t v183 = vsub_f32(v57, v141); + float32x2_t v184 = vsub_f32(v71, v113); + float32x2_t v185 = vsub_f32(v85, v127); + float32x2_t v186 = vsub_f32(v99, v141); + float32x2_t v187 = vadd_f32(v29, v71); + float32x2_t v189 = vadd_f32(v43, v85); + float32x2_t v191 = vadd_f32(v57, v99); + float32x2_t v149 = vadd_f32(v148, v112); + float32x2_t v151 = vadd_f32(v150, v126); + float32x2_t v153 = vadd_f32(v152, v140); + float32x2_t v154 = vadd_f32(v142, v144); + float32x2_t v155 = vadd_f32(v145, v147); + float32x2_t v171 = vsub_f32(v142, v145); + float32x2_t v172 = vsub_f32(v144, v147); + float32x2_t v188 = vadd_f32(v187, v113); + float32x2_t v190 = vadd_f32(v189, v127); + float32x2_t v192 = vadd_f32(v191, v141); + float32x2_t v193 = vadd_f32(v181, v183); + float32x2_t v194 = vadd_f32(v184, v186); + float32x2_t v203 = vsub_f32(v181, v184); + float32x2_t v204 = vsub_f32(v183, v186); + float32x2_t v248 = vmul_f32(v145, v247); + float32x2_t v260 = vmul_f32(v147, v259); + float32x2_t v268 = vmul_f32(v144, v267); + float32x2_t v347 = vrev64_f32(v184); + float32x2_t v361 = vrev64_f32(v181); + float32x2_t v368 = vrev64_f32(v186); + float32x2_t v382 = vrev64_f32(v183); + float32x2_t v156 = vadd_f32(v149, v151); + float32x2_t v165 = vadd_f32(v155, v146); + float32x2_t v166 = vadd_f32(v154, v143); + float32x2_t v168 = vsub_f32(v155, v146); + float32x2_t v169 = vsub_f32(v154, v143); + float32x2_t v173 = vsub_f32(v142, v172); + float32x2_t v175 = vadd_f32(v171, v147); + float32x2_t v178 = vsub_f32(v149, v153); + 
float32x2_t v179 = vsub_f32(v151, v153); + float32x2_t v195 = vadd_f32(v188, v190); + float32x2_t v197 = vadd_f32(v194, v185); + float32x2_t v198 = vadd_f32(v193, v182); + float32x2_t v200 = vsub_f32(v194, v185); + float32x2_t v201 = vsub_f32(v193, v182); + float32x2_t v205 = vsub_f32(v181, v204); + float32x2_t v207 = vadd_f32(v203, v186); + float32x2_t v210 = vsub_f32(v188, v192); + float32x2_t v211 = vsub_f32(v190, v192); + float32x2_t v252 = vmul_f32(v171, v251); + float32x2_t v264 = vmul_f32(v172, v263); + float32x2_t v348 = vmul_f32(v347, v346); + float32x2_t v354 = vrev64_f32(v203); + float32x2_t v369 = vmul_f32(v368, v367); + float32x2_t v375 = vrev64_f32(v204); + float32x2_t v383 = vmul_f32(v382, v381); + float32x2_t v157 = vadd_f32(v156, v153); + float32x2_t v167 = vsub_f32(v166, v165); + float32x2_t v170 = vsub_f32(v169, v168); + float32x2_t v174 = vsub_f32(v173, v146); + float32x2_t v176 = vsub_f32(v175, v143); + float32x2_t v180 = vadd_f32(v178, v179); + float32x2_t v196 = vadd_f32(v195, v192); + float32x2_t v199 = vsub_f32(v198, v197); + float32x2_t v202 = vsub_f32(v201, v200); + float32x2_t v206 = vsub_f32(v205, v185); + float32x2_t v208 = vsub_f32(v207, v182); + float32x2_t v212 = vadd_f32(v210, v211); + float32x2_t v224 = vmul_f32(v165, v223); + float32x2_t v228 = vmul_f32(v166, v227); + float32x2_t v236 = vmul_f32(v168, v235); + float32x2_t v240 = vmul_f32(v169, v239); + float32x2_t v284 = vmul_f32(v178, v283); + float32x2_t v288 = vmul_f32(v179, v287); + float32x2_t v305 = vrev64_f32(v197); + float32x2_t v312 = vrev64_f32(v198); + float32x2_t v326 = vrev64_f32(v200); + float32x2_t v333 = vrev64_f32(v201); + float32x2_t v355 = vmul_f32(v354, v353); + float32x2_t v376 = vmul_f32(v375, v374); + float32x2_t v410 = vrev64_f32(v210); + float32x2_t v417 = vrev64_f32(v211); + float32x2_t v164 = vadd_f32(v163, v157); + float32x2_t v177 = vsub_f32(v174, v176); + float32x2_t v209 = vsub_f32(v206, v208); + float32x2_t v220 = vmul_f32(v157, v219); + float32x2_t v232 = vmul_f32(v167, v231); + float32x2_t v244 = vmul_f32(v170, v243); + float32x2_t v272 = vmul_f32(v174, v271); + float32x2_t v276 = vmul_f32(v176, v275); + float32x2_t v292 = vmul_f32(v180, v291); + float32x2_t v298 = vrev64_f32(v196); + float32x2_t v306 = vmul_f32(v305, v304); + float32x2_t v313 = vmul_f32(v312, v311); + float32x2_t v319 = vrev64_f32(v199); + float32x2_t v327 = vmul_f32(v326, v325); + float32x2_t v334 = vmul_f32(v333, v332); + float32x2_t v340 = vrev64_f32(v202); + float32x2_t v389 = vrev64_f32(v206); + float32x2_t v396 = vrev64_f32(v208); + float32x2_t v411 = vmul_f32(v410, v409); + float32x2_t v418 = vmul_f32(v417, v416); + float32x2_t v424 = vrev64_f32(v212); + float32x2_t v426 = vadd_f32(v224, v228); + float32x2_t v427 = vadd_f32(v236, v240); + float32x2_t v280 = vmul_f32(v177, v279); + float32x2_t v299 = vmul_f32(v298, v297); + float32x2_t v320 = vmul_f32(v319, v318); + float32x2_t v341 = vmul_f32(v340, v339); + float32x2_t v390 = vmul_f32(v389, v388); + float32x2_t v397 = vmul_f32(v396, v395); + float32x2_t v403 = vrev64_f32(v209); + float32x2_t v425 = vmul_f32(v424, v423); + float32x2_t v429 = vadd_f32(v426, v427); + float32x2_t v430 = vadd_f32(v224, v232); + float32x2_t v431 = vadd_f32(v236, v244); + float32x2_t v448 = vsub_f32(v426, v427); + float32x2_t v450 = vsub_f32(v284, v292); + float32x2_t v451 = vsub_f32(v288, v292); + float32x2_t v452 = vadd_f32(v220, v164); + float32x2_t v457 = vadd_f32(v306, v313); + float32x2_t v458 = vadd_f32(v327, v334); + int16x4_t v513 = + 
vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v164, 15), (int32x2_t){0, 0})); + float32x2_t v404 = vmul_f32(v403, v402); + float32x2_t v428 = vadd_f32(v276, v280); + float32x2_t v432 = vadd_f32(v272, v280); + float32x2_t v433 = vsub_f32(v248, v429); + float32x2_t v434 = vadd_f32(v430, v431); + float32x2_t v440 = vsub_f32(v430, v431); + float32x2_t v445 = vadd_f32(v429, v268); + float32x2_t v453 = vadd_f32(v452, v450); + float32x2_t v454 = vsub_f32(v452, v450); + float32x2_t v456 = vadd_f32(v452, v451); + float32x2_t v460 = vadd_f32(v457, v458); + float32x2_t v461 = vadd_f32(v306, v320); + float32x2_t v462 = vadd_f32(v327, v341); + float32x2_t v479 = vsub_f32(v457, v458); + float32x2_t v481 = vsub_f32(v411, v425); + float32x2_t v482 = vsub_f32(v418, v425); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v513), 0); + float32x2_t v435 = vsub_f32(v260, v432); + float32x2_t v436 = vadd_f32(v252, v428); + float32x2_t v438 = vadd_f32(v434, v264); + float32x2_t v441 = vadd_f32(v440, v428); + float32x2_t v442 = vadd_f32(v433, v434); + float32x2_t v449 = vadd_f32(v448, v432); + float32x2_t v455 = vsub_f32(v454, v451); + float32x2_t v459 = vadd_f32(v397, v404); + float32x2_t v463 = vadd_f32(v390, v404); + float32x2_t v464 = vsub_f32(v348, v460); + float32x2_t v465 = vadd_f32(v461, v462); + float32x2_t v471 = vsub_f32(v461, v462); + float32x2_t v476 = vadd_f32(v460, v383); + float32x2_t v483 = vadd_f32(v299, v481); + float32x2_t v484 = vsub_f32(v299, v481); + float32x2_t v486 = vadd_f32(v299, v482); + float32x2_t v437 = vadd_f32(v436, v433); + float32x2_t v439 = vadd_f32(v438, v435); + float32x2_t v443 = vfma_f32(v442, v142, v255); + float32x2_t v446 = vadd_f32(v445, v435); + float32x2_t v466 = vsub_f32(v369, v463); + float32x2_t v467 = vadd_f32(v355, v459); + float32x2_t v469 = vadd_f32(v465, v376); + float32x2_t v472 = vadd_f32(v471, v459); + float32x2_t v473 = vadd_f32(v464, v465); + float32x2_t v480 = vadd_f32(v479, v463); + float32x2_t v485 = vsub_f32(v484, v482); + float32x2_t v491 = vsub_f32(v449, v441); + float32x2_t v495 = vsub_f32(v456, v449); + float32x2_t v498 = vadd_f32(v441, v456); + float32x2_t v444 = vadd_f32(v443, v432); + float32x2_t v447 = vadd_f32(v446, v428); + float32x2_t v468 = vadd_f32(v467, v464); + float32x2_t v470 = vadd_f32(v469, v466); + float32x2_t v474 = vfma_f32(v473, v361, v360); + float32x2_t v477 = vadd_f32(v476, v466); + float32x2_t v492 = vadd_f32(v491, v456); + float32x2_t v496 = vadd_f32(v437, v453); + float32x2_t v497 = vadd_f32(v439, v455); + float32x2_t v503 = vsub_f32(v480, v472); + float32x2_t v507 = vsub_f32(v480, v486); + float32x2_t v510 = vadd_f32(v472, v486); + float32x2_t v475 = vadd_f32(v474, v463); + float32x2_t v478 = vadd_f32(v477, v459); + float32x2_t v487 = vsub_f32(v444, v437); + float32x2_t v489 = vsub_f32(v447, v439); + float32x2_t v493 = vsub_f32(v453, v444); + float32x2_t v494 = vsub_f32(v455, v447); + float32x2_t v504 = vadd_f32(v503, v486); + float32x2_t v508 = vadd_f32(v468, v483); + float32x2_t v509 = vadd_f32(v470, v485); + float32x2_t v531 = vsub_f32(v498, v510); + float32x2_t v538 = vadd_f32(v498, v510); + float32x2_t v545 = vadd_f32(v495, v507); + float32x2_t v552 = vsub_f32(v495, v507); + float32x2_t v488 = vadd_f32(v487, v453); + float32x2_t v490 = vadd_f32(v489, v455); + float32x2_t v499 = vsub_f32(v475, v468); + float32x2_t v501 = vsub_f32(v478, v470); + float32x2_t v505 = vsub_f32(v483, v475); + float32x2_t v506 = vsub_f32(v485, v478); + int16x4_t v534 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v531, 15), (int32x2_t){0, 0})); + int16x4_t 
v541 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v538, 15), (int32x2_t){0, 0})); + int16x4_t v548 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v545, 15), (int32x2_t){0, 0})); + int16x4_t v555 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v552, 15), (int32x2_t){0, 0})); + float32x2_t v559 = vadd_f32(v497, v509); + float32x2_t v566 = vsub_f32(v497, v509); + float32x2_t v573 = vadd_f32(v492, v504); + float32x2_t v580 = vsub_f32(v492, v504); + float32x2_t v615 = vsub_f32(v496, v508); + float32x2_t v622 = vadd_f32(v496, v508); + float32x2_t v500 = vadd_f32(v499, v483); + float32x2_t v502 = vadd_f32(v501, v485); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v534), 0); + v6[ostride * 17] = vget_lane_s32(vreinterpret_s32_s16(v541), 0); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v548), 0); + v6[ostride * 16] = vget_lane_s32(vreinterpret_s32_s16(v555), 0); + int16x4_t v562 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v559, 15), (int32x2_t){0, 0})); + int16x4_t v569 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v566, 15), (int32x2_t){0, 0})); + int16x4_t v576 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v573, 15), (int32x2_t){0, 0})); + int16x4_t v583 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v580, 15), (int32x2_t){0, 0})); + float32x2_t v587 = vadd_f32(v494, v506); + float32x2_t v594 = vsub_f32(v494, v506); + float32x2_t v601 = vadd_f32(v493, v505); + float32x2_t v608 = vsub_f32(v493, v505); + int16x4_t v618 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v615, 15), (int32x2_t){0, 0})); + int16x4_t v625 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v622, 15), (int32x2_t){0, 0})); + float32x2_t v517 = vadd_f32(v488, v500); + float32x2_t v524 = vsub_f32(v488, v500); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v562), 0); + v6[ostride * 15] = vget_lane_s32(vreinterpret_s32_s16(v569), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v576), 0); + v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v583), 0); + int16x4_t v590 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v587, 15), (int32x2_t){0, 0})); + int16x4_t v597 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v594, 15), (int32x2_t){0, 0})); + int16x4_t v604 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v601, 15), (int32x2_t){0, 0})); + int16x4_t v611 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v608, 15), (int32x2_t){0, 0})); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v618), 0); + v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v625), 0); + float32x2_t v629 = vadd_f32(v490, v502); + float32x2_t v636 = vsub_f32(v490, v502); + int16x4_t v520 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v517, 15), (int32x2_t){0, 0})); + int16x4_t v527 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v524, 15), (int32x2_t){0, 0})); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v590), 0); + v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v597), 0); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v604), 0); + v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v611), 0); + int16x4_t v632 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v629, 15), (int32x2_t){0, 0})); + int16x4_t v639 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v636, 15), (int32x2_t){0, 0})); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v520), 0); + v6[ostride * 18] = vget_lane_s32(vreinterpret_s32_s16(v527), 0); + v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v632), 0); + v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v639), 0); + v5 += 1 * 1; + v6 += 1 * 1; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uu19(const 
armral_cmplx_int16_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * 1; + int64_t v12 = v10 * 1; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + float v259 = -1.0555555555555556e+00F; + float v264 = 1.7752228513927079e-01F; + float v269 = -1.2820077502191529e-01F; + float v274 = 4.9321510117355499e-02F; + float v279 = 5.7611011491005903e-01F; + float v284 = -7.4996449655536279e-01F; + float v289 = -1.7385438164530381e-01F; + float v294 = -2.1729997561977314e+00F; + float v299 = -1.7021211726914738e+00F; + float v304 = 4.7087858350625778e-01F; + float v309 = -2.0239400846888440e+00F; + float v314 = 1.0551641201664090e-01F; + float v319 = 2.1294564967054850e+00F; + float v324 = -7.5087543897371167e-01F; + float v329 = 1.4812817695157160e-01F; + float v334 = 8.9900361592528333e-01F; + float v339 = -6.2148246772602778e-01F; + float v344 = -7.9869352098712687e-01F; + float v349 = -4.7339199623771833e-01F; + float v354 = 2.4216105241892630e-01F; + float v361 = 5.9368607967505101e-02F; + float v368 = -1.2578688255176201e-02F; + float v375 = 4.6789919712328903e-02F; + float v382 = 9.3750121913782358e-01F; + float v389 = 5.0111537043352902e-02F; + float v396 = 9.8761275618117661e-01F; + float v403 = 1.1745786501205959e+00F; + float v410 = -1.1114482296234993e+00F; + float v417 = -2.2860268797440955e+00F; + float v424 = -2.6420523257930939e-01F; + float v431 = -2.1981792779352136e+00F; + float v438 = -1.9339740453559042e+00F; + float v445 = 7.4825847091254893e-01F; + float v452 = 4.7820835642768872e-01F; + float v459 = -2.7005011448486022e-01F; + float v466 = 3.4642356159542270e-01F; + float v473 = 8.3485429360688279e-01F; + float v480 = 3.9375928506743518e-01F; + const int32_t *v747 = &v5[v0]; + int32_t *v968 = &v6[v2]; + int64_t v27 = v0 * 18; + int64_t v37 = v0 * 2; + int64_t v45 = v0 * 17; + int64_t v55 = v0 * 4; + int64_t v63 = v0 * 15; + int64_t v73 = v0 * 8; + int64_t v81 = v0 * 11; + int64_t v91 = v0 * 16; + int64_t v99 = v0 * 3; + int64_t v109 = v0 * 13; + int64_t v117 = v0 * 6; + int64_t v127 = v0 * 7; + int64_t v135 = v0 * 12; + int64_t v145 = v0 * 14; + int64_t v153 = v0 * 5; + int64_t v163 = v0 * 9; + int64_t v171 = v0 * 10; + float v357 = v4 * v354; + float v364 = v4 * v361; + float v371 = v4 * v368; + float v378 = v4 * v375; + float v385 = v4 * v382; + float v392 = v4 * v389; + float v399 = v4 * v396; + float v406 = v4 * v403; + float v413 = v4 * v410; + float v420 = v4 * v417; + float v427 = v4 * v424; + float v434 = v4 * v431; + float v441 = v4 * v438; + float v448 = v4 * v445; + float v455 = v4 * v452; + float v462 = v4 * v459; + float v469 = v4 * v466; + float v476 = v4 * v473; + float v483 = v4 * v480; + int64_t v590 = v2 * 18; + int64_t v599 = v2 * 2; + int64_t v608 = v2 * 17; + int64_t v617 = v2 * 3; + int64_t v626 = v2 * 16; + int64_t v635 = v2 * 4; + int64_t v644 = v2 * 15; + int64_t v653 = v2 * 5; + int64_t v662 = v2 * 14; + int64_t v671 = v2 * 6; + int64_t v680 = v2 * 13; + int64_t v689 = v2 * 7; + int64_t v698 = v2 * 12; + int64_t v707 = v2 * 8; + int64_t v716 = v2 * 11; + int64_t v725 = v2 * 9; + int64_t v734 = v2 * 10; + const int32_t *v910 = &v5[0]; + svfloat32_t v914 = svdup_n_f32(v259); + svfloat32_t v915 = svdup_n_f32(v264); + svfloat32_t v916 = 
svdup_n_f32(v269); + svfloat32_t v917 = svdup_n_f32(v274); + svfloat32_t v918 = svdup_n_f32(v279); + svfloat32_t v919 = svdup_n_f32(v284); + svfloat32_t v920 = svdup_n_f32(v289); + svfloat32_t v921 = svdup_n_f32(v294); + svfloat32_t v922 = svdup_n_f32(v299); + svfloat32_t v923 = svdup_n_f32(v304); + svfloat32_t v924 = svdup_n_f32(v309); + svfloat32_t v925 = svdup_n_f32(v314); + svfloat32_t v926 = svdup_n_f32(v319); + svfloat32_t v927 = svdup_n_f32(v324); + svfloat32_t v928 = svdup_n_f32(v329); + svfloat32_t v929 = svdup_n_f32(v334); + svfloat32_t v930 = svdup_n_f32(v339); + svfloat32_t v931 = svdup_n_f32(v344); + svfloat32_t v932 = svdup_n_f32(v349); + int32_t *v959 = &v6[0]; + svfloat32_t v25 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v747[0])), + 1.F / (1ULL << 15ULL)); + const int32_t *v756 = &v5[v27]; + const int32_t *v765 = &v5[v37]; + const int32_t *v774 = &v5[v45]; + const int32_t *v783 = &v5[v55]; + const int32_t *v792 = &v5[v63]; + const int32_t *v801 = &v5[v73]; + const int32_t *v810 = &v5[v81]; + const int32_t *v819 = &v5[v91]; + const int32_t *v828 = &v5[v99]; + const int32_t *v837 = &v5[v109]; + const int32_t *v846 = &v5[v117]; + const int32_t *v855 = &v5[v127]; + const int32_t *v864 = &v5[v135]; + const int32_t *v873 = &v5[v145]; + const int32_t *v882 = &v5[v153]; + const int32_t *v891 = &v5[v163]; + const int32_t *v900 = &v5[v171]; + svfloat32_t v933 = svdup_n_f32(v357); + svfloat32_t v934 = svdup_n_f32(v364); + svfloat32_t v935 = svdup_n_f32(v371); + svfloat32_t v936 = svdup_n_f32(v378); + svfloat32_t v937 = svdup_n_f32(v385); + svfloat32_t v938 = svdup_n_f32(v392); + svfloat32_t v939 = svdup_n_f32(v399); + svfloat32_t v940 = svdup_n_f32(v406); + svfloat32_t v941 = svdup_n_f32(v413); + svfloat32_t v942 = svdup_n_f32(v420); + svfloat32_t v943 = svdup_n_f32(v427); + svfloat32_t v944 = svdup_n_f32(v434); + svfloat32_t v945 = svdup_n_f32(v441); + svfloat32_t v946 = svdup_n_f32(v448); + svfloat32_t v947 = svdup_n_f32(v455); + svfloat32_t v948 = svdup_n_f32(v462); + svfloat32_t v949 = svdup_n_f32(v469); + svfloat32_t v950 = svdup_n_f32(v476); + svfloat32_t v951 = svdup_n_f32(v483); + int32_t *v977 = &v6[v590]; + int32_t *v986 = &v6[v599]; + int32_t *v995 = &v6[v608]; + int32_t *v1004 = &v6[v617]; + int32_t *v1013 = &v6[v626]; + int32_t *v1022 = &v6[v635]; + int32_t *v1031 = &v6[v644]; + int32_t *v1040 = &v6[v653]; + int32_t *v1049 = &v6[v662]; + int32_t *v1058 = &v6[v671]; + int32_t *v1067 = &v6[v680]; + int32_t *v1076 = &v6[v689]; + int32_t *v1085 = &v6[v698]; + int32_t *v1094 = &v6[v707]; + int32_t *v1103 = &v6[v716]; + int32_t *v1112 = &v6[v725]; + int32_t *v1121 = &v6[v734]; + svfloat32_t v203 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v910[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v33 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v756[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v43 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v765[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v51 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v774[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v61 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v783[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v69 = svmul_n_f32_x( + pred_full, + 
svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v792[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v79 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v801[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v87 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v810[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v97 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v819[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v105 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v828[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v115 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v837[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v123 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v846[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v133 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v855[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v141 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v864[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v151 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v873[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v159 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v882[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v169 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v891[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v177 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v900[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v34 = svadd_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v35 = svsub_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v52 = svadd_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v53 = svsub_f32_x(svptrue_b32(), v51, v43); + svfloat32_t v70 = svadd_f32_x(svptrue_b32(), v61, v69); + svfloat32_t v71 = svsub_f32_x(svptrue_b32(), v61, v69); + svfloat32_t v88 = svadd_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v89 = svsub_f32_x(svptrue_b32(), v87, v79); + svfloat32_t v106 = svadd_f32_x(svptrue_b32(), v97, v105); + svfloat32_t v107 = svsub_f32_x(svptrue_b32(), v97, v105); + svfloat32_t v124 = svadd_f32_x(svptrue_b32(), v115, v123); + svfloat32_t v125 = svsub_f32_x(svptrue_b32(), v123, v115); + svfloat32_t v142 = svadd_f32_x(svptrue_b32(), v133, v141); + svfloat32_t v143 = svsub_f32_x(svptrue_b32(), v133, v141); + svfloat32_t v160 = svadd_f32_x(svptrue_b32(), v151, v159); + svfloat32_t v161 = svsub_f32_x(svptrue_b32(), v159, v151); + svfloat32_t v178 = svadd_f32_x(svptrue_b32(), v169, v177); + svfloat32_t v179 = svsub_f32_x(svptrue_b32(), v169, v177); + svfloat32_t v180 = svsub_f32_x(svptrue_b32(), v34, v142); + svfloat32_t v181 = svsub_f32_x(svptrue_b32(), v52, v160); + svfloat32_t v182 = svsub_f32_x(svptrue_b32(), v70, v178); + svfloat32_t v183 = svsub_f32_x(svptrue_b32(), v88, v142); + svfloat32_t v184 = svsub_f32_x(svptrue_b32(), v106, v160); + svfloat32_t v185 = svsub_f32_x(svptrue_b32(), v124, v178); + svfloat32_t v186 = svadd_f32_x(svptrue_b32(), v34, v88); + svfloat32_t v188 = svadd_f32_x(svptrue_b32(), v52, v106); + 
svfloat32_t v190 = svadd_f32_x(svptrue_b32(), v70, v124); + svfloat32_t v221 = svsub_f32_x(svptrue_b32(), v35, v143); + svfloat32_t v222 = svsub_f32_x(svptrue_b32(), v53, v161); + svfloat32_t v223 = svsub_f32_x(svptrue_b32(), v71, v179); + svfloat32_t v224 = svsub_f32_x(svptrue_b32(), v89, v143); + svfloat32_t v225 = svsub_f32_x(svptrue_b32(), v107, v161); + svfloat32_t v226 = svsub_f32_x(svptrue_b32(), v125, v179); + svfloat32_t v227 = svadd_f32_x(svptrue_b32(), v35, v89); + svfloat32_t v229 = svadd_f32_x(svptrue_b32(), v53, v107); + svfloat32_t v231 = svadd_f32_x(svptrue_b32(), v71, v125); + svfloat32_t v187 = svadd_f32_x(svptrue_b32(), v186, v142); + svfloat32_t v189 = svadd_f32_x(svptrue_b32(), v188, v160); + svfloat32_t v191 = svadd_f32_x(svptrue_b32(), v190, v178); + svfloat32_t v192 = svadd_f32_x(svptrue_b32(), v180, v182); + svfloat32_t v193 = svadd_f32_x(svptrue_b32(), v183, v185); + svfloat32_t v211 = svsub_f32_x(svptrue_b32(), v180, v183); + svfloat32_t v212 = svsub_f32_x(svptrue_b32(), v182, v185); + svfloat32_t v228 = svadd_f32_x(svptrue_b32(), v227, v143); + svfloat32_t v230 = svadd_f32_x(svptrue_b32(), v229, v161); + svfloat32_t v232 = svadd_f32_x(svptrue_b32(), v231, v179); + svfloat32_t v233 = svadd_f32_x(svptrue_b32(), v221, v223); + svfloat32_t v234 = svadd_f32_x(svptrue_b32(), v224, v226); + svfloat32_t v243 = svsub_f32_x(svptrue_b32(), v221, v224); + svfloat32_t v244 = svsub_f32_x(svptrue_b32(), v223, v226); + svfloat32_t zero408 = svdup_n_f32(0); + svfloat32_t v408 = svcmla_f32_x(pred_full, zero408, v940, v224, 90); + svfloat32_t zero429 = svdup_n_f32(0); + svfloat32_t v429 = svcmla_f32_x(pred_full, zero429, v943, v226, 90); + svfloat32_t v194 = svadd_f32_x(svptrue_b32(), v187, v189); + svfloat32_t v205 = svadd_f32_x(svptrue_b32(), v193, v184); + svfloat32_t v206 = svadd_f32_x(svptrue_b32(), v192, v181); + svfloat32_t v208 = svsub_f32_x(svptrue_b32(), v193, v184); + svfloat32_t v209 = svsub_f32_x(svptrue_b32(), v192, v181); + svfloat32_t v213 = svsub_f32_x(svptrue_b32(), v180, v212); + svfloat32_t v215 = svadd_f32_x(svptrue_b32(), v211, v185); + svfloat32_t v218 = svsub_f32_x(svptrue_b32(), v187, v191); + svfloat32_t v219 = svsub_f32_x(svptrue_b32(), v189, v191); + svfloat32_t v235 = svadd_f32_x(svptrue_b32(), v228, v230); + svfloat32_t v237 = svadd_f32_x(svptrue_b32(), v234, v225); + svfloat32_t v238 = svadd_f32_x(svptrue_b32(), v233, v222); + svfloat32_t v240 = svsub_f32_x(svptrue_b32(), v234, v225); + svfloat32_t v241 = svsub_f32_x(svptrue_b32(), v233, v222); + svfloat32_t v245 = svsub_f32_x(svptrue_b32(), v221, v244); + svfloat32_t v247 = svadd_f32_x(svptrue_b32(), v243, v226); + svfloat32_t v250 = svsub_f32_x(svptrue_b32(), v228, v232); + svfloat32_t v251 = svsub_f32_x(svptrue_b32(), v230, v232); + svfloat32_t v195 = svadd_f32_x(svptrue_b32(), v194, v191); + svfloat32_t v207 = svsub_f32_x(svptrue_b32(), v206, v205); + svfloat32_t v210 = svsub_f32_x(svptrue_b32(), v209, v208); + svfloat32_t v214 = svsub_f32_x(svptrue_b32(), v213, v184); + svfloat32_t v216 = svsub_f32_x(svptrue_b32(), v215, v181); + svfloat32_t v220 = svadd_f32_x(svptrue_b32(), v218, v219); + svfloat32_t v236 = svadd_f32_x(svptrue_b32(), v235, v232); + svfloat32_t v239 = svsub_f32_x(svptrue_b32(), v238, v237); + svfloat32_t v242 = svsub_f32_x(svptrue_b32(), v241, v240); + svfloat32_t v246 = svsub_f32_x(svptrue_b32(), v245, v225); + svfloat32_t v248 = svsub_f32_x(svptrue_b32(), v247, v222); + svfloat32_t v252 = svadd_f32_x(svptrue_b32(), v250, v251); + svfloat32_t v272 = svmul_f32_x(svptrue_b32(), 
v206, v916); + svfloat32_t v287 = svmul_f32_x(svptrue_b32(), v209, v919); + svfloat32_t zero366 = svdup_n_f32(0); + svfloat32_t v366 = svcmla_f32_x(pred_full, zero366, v934, v237, 90); + svfloat32_t zero387 = svdup_n_f32(0); + svfloat32_t v387 = svcmla_f32_x(pred_full, zero387, v937, v240, 90); + svfloat32_t zero471 = svdup_n_f32(0); + svfloat32_t v471 = svcmla_f32_x(pred_full, zero471, v949, v250, 90); + svfloat32_t zero478 = svdup_n_f32(0); + svfloat32_t v478 = svcmla_f32_x(pred_full, zero478, v950, v251, 90); + svfloat32_t v204 = svadd_f32_x(svptrue_b32(), v203, v195); + svfloat32_t v217 = svsub_f32_x(svptrue_b32(), v214, v216); + svfloat32_t v249 = svsub_f32_x(svptrue_b32(), v246, v248); + svfloat32_t v277 = svmul_f32_x(svptrue_b32(), v207, v917); + svfloat32_t v292 = svmul_f32_x(svptrue_b32(), v210, v920); + svfloat32_t v352 = svmul_f32_x(svptrue_b32(), v220, v932); + svfloat32_t zero359 = svdup_n_f32(0); + svfloat32_t v359 = svcmla_f32_x(pred_full, zero359, v933, v236, 90); + svfloat32_t zero485 = svdup_n_f32(0); + svfloat32_t v485 = svcmla_f32_x(pred_full, zero485, v951, v252, 90); + svfloat32_t v486 = svmla_f32_x(pred_full, v272, v205, v915); + svfloat32_t v487 = svmla_f32_x(pred_full, v287, v208, v918); + svfloat32_t v517 = svcmla_f32_x(pred_full, v366, v935, v238, 90); + svfloat32_t v518 = svcmla_f32_x(pred_full, v387, v938, v241, 90); + svfloat32_t v337 = svmul_f32_x(svptrue_b32(), v217, v929); + svfloat32_t zero464 = svdup_n_f32(0); + svfloat32_t v464 = svcmla_f32_x(pred_full, zero464, v948, v249, 90); + svfloat32_t v489 = svadd_f32_x(svptrue_b32(), v486, v487); + svfloat32_t v490 = svmla_f32_x(pred_full, v277, v205, v915); + svfloat32_t v491 = svmla_f32_x(pred_full, v292, v208, v918); + svfloat32_t v508 = svsub_f32_x(svptrue_b32(), v486, v487); + svfloat32_t v510 = svnmls_f32_x(pred_full, v352, v218, v930); + svfloat32_t v511 = svnmls_f32_x(pred_full, v352, v219, v931); + svfloat32_t v512 = svmla_f32_x(pred_full, v204, v195, v914); + svfloat32_t v520 = svadd_f32_x(svptrue_b32(), v517, v518); + svfloat32_t v521 = svcmla_f32_x(pred_full, v366, v936, v239, 90); + svfloat32_t v522 = svcmla_f32_x(pred_full, v387, v939, v242, 90); + svfloat32_t v539 = svsub_f32_x(svptrue_b32(), v517, v518); + svfloat32_t v541 = svsub_f32_x(svptrue_b32(), v471, v485); + svfloat32_t v542 = svsub_f32_x(svptrue_b32(), v478, v485); + svint16_t v573 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v204, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v488 = svmla_f32_x(pred_full, v337, v216, v928); + svfloat32_t v492 = svmla_f32_x(pred_full, v337, v214, v927); + svfloat32_t v493 = svnmls_f32_x(pred_full, v489, v183, v921); + svfloat32_t v494 = svadd_f32_x(svptrue_b32(), v490, v491); + svfloat32_t v500 = svsub_f32_x(svptrue_b32(), v490, v491); + svfloat32_t v505 = svmla_f32_x(pred_full, v489, v182, v926); + svfloat32_t v513 = svadd_f32_x(svptrue_b32(), v512, v510); + svfloat32_t v514 = svsub_f32_x(svptrue_b32(), v512, v510); + svfloat32_t v516 = svadd_f32_x(svptrue_b32(), v512, v511); + svfloat32_t v519 = svcmla_f32_x(pred_full, v464, v947, v248, 90); + svfloat32_t v523 = svcmla_f32_x(pred_full, v464, v946, v246, 90); + svfloat32_t v524 = svsub_f32_x(svptrue_b32(), v408, v520); + svfloat32_t v525 = svadd_f32_x(svptrue_b32(), v521, v522); + svfloat32_t v531 = svsub_f32_x(svptrue_b32(), v521, v522); + svfloat32_t v536 = svcmla_f32_x(pred_full, v520, v945, v223, 90); + svfloat32_t v543 = 
svadd_f32_x(svptrue_b32(), v359, v541); + svfloat32_t v544 = svsub_f32_x(svptrue_b32(), v359, v541); + svfloat32_t v546 = svadd_f32_x(svptrue_b32(), v359, v542); + svst1w_u64(pred_full, (unsigned *)(v959), svreinterpret_u64_s16(v573)); + svfloat32_t v495 = svnmls_f32_x(pred_full, v492, v185, v924); + svfloat32_t v496 = svmla_f32_x(pred_full, v488, v211, v922); + svfloat32_t v498 = svmla_f32_x(pred_full, v494, v212, v925); + svfloat32_t v501 = svadd_f32_x(svptrue_b32(), v500, v488); + svfloat32_t v502 = svadd_f32_x(svptrue_b32(), v493, v494); + svfloat32_t v509 = svadd_f32_x(svptrue_b32(), v508, v492); + svfloat32_t v515 = svsub_f32_x(svptrue_b32(), v514, v511); + svfloat32_t v526 = svsub_f32_x(svptrue_b32(), v429, v523); + svfloat32_t v527 = svcmla_f32_x(pred_full, v519, v941, v243, 90); + svfloat32_t v529 = svcmla_f32_x(pred_full, v525, v944, v244, 90); + svfloat32_t v532 = svadd_f32_x(svptrue_b32(), v531, v519); + svfloat32_t v533 = svadd_f32_x(svptrue_b32(), v524, v525); + svfloat32_t v540 = svadd_f32_x(svptrue_b32(), v539, v523); + svfloat32_t v545 = svsub_f32_x(svptrue_b32(), v544, v542); + svfloat32_t v497 = svadd_f32_x(svptrue_b32(), v496, v493); + svfloat32_t v499 = svadd_f32_x(svptrue_b32(), v498, v495); + svfloat32_t v503 = svmla_f32_x(pred_full, v502, v180, v923); + svfloat32_t v506 = svadd_f32_x(svptrue_b32(), v505, v495); + svfloat32_t v528 = svadd_f32_x(svptrue_b32(), v527, v524); + svfloat32_t v530 = svadd_f32_x(svptrue_b32(), v529, v526); + svfloat32_t v534 = svcmla_f32_x(pred_full, v533, v942, v221, 90); + svfloat32_t v537 = svadd_f32_x(svptrue_b32(), v536, v526); + svfloat32_t v551 = svsub_f32_x(svptrue_b32(), v509, v501); + svfloat32_t v555 = svsub_f32_x(svptrue_b32(), v516, v509); + svfloat32_t v558 = svadd_f32_x(svptrue_b32(), v501, v516); + svfloat32_t v563 = svsub_f32_x(svptrue_b32(), v540, v532); + svfloat32_t v567 = svsub_f32_x(svptrue_b32(), v540, v546); + svfloat32_t v570 = svadd_f32_x(svptrue_b32(), v532, v546); + svfloat32_t v504 = svadd_f32_x(svptrue_b32(), v503, v492); + svfloat32_t v507 = svadd_f32_x(svptrue_b32(), v506, v488); + svfloat32_t v535 = svadd_f32_x(svptrue_b32(), v534, v523); + svfloat32_t v538 = svadd_f32_x(svptrue_b32(), v537, v519); + svfloat32_t v552 = svadd_f32_x(svptrue_b32(), v551, v516); + svfloat32_t v556 = svadd_f32_x(svptrue_b32(), v497, v513); + svfloat32_t v557 = svadd_f32_x(svptrue_b32(), v499, v515); + svfloat32_t v564 = svadd_f32_x(svptrue_b32(), v563, v546); + svfloat32_t v568 = svadd_f32_x(svptrue_b32(), v528, v543); + svfloat32_t v569 = svadd_f32_x(svptrue_b32(), v530, v545); + svfloat32_t v597 = svsub_f32_x(svptrue_b32(), v558, v570); + svfloat32_t v606 = svadd_f32_x(svptrue_b32(), v558, v570); + svfloat32_t v615 = svadd_f32_x(svptrue_b32(), v555, v567); + svfloat32_t v624 = svsub_f32_x(svptrue_b32(), v555, v567); + svfloat32_t v547 = svsub_f32_x(svptrue_b32(), v504, v497); + svfloat32_t v549 = svsub_f32_x(svptrue_b32(), v507, v499); + svfloat32_t v553 = svsub_f32_x(svptrue_b32(), v513, v504); + svfloat32_t v554 = svsub_f32_x(svptrue_b32(), v515, v507); + svfloat32_t v559 = svsub_f32_x(svptrue_b32(), v535, v528); + svfloat32_t v561 = svsub_f32_x(svptrue_b32(), v538, v530); + svfloat32_t v565 = svsub_f32_x(svptrue_b32(), v543, v535); + svfloat32_t v566 = svsub_f32_x(svptrue_b32(), v545, v538); + svint16_t v600 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v597, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + 
svint16_t v609 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v606, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v618 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v615, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v627 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v624, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v633 = svadd_f32_x(svptrue_b32(), v557, v569); + svfloat32_t v642 = svsub_f32_x(svptrue_b32(), v557, v569); + svfloat32_t v651 = svadd_f32_x(svptrue_b32(), v552, v564); + svfloat32_t v660 = svsub_f32_x(svptrue_b32(), v552, v564); + svfloat32_t v705 = svsub_f32_x(svptrue_b32(), v556, v568); + svfloat32_t v714 = svadd_f32_x(svptrue_b32(), v556, v568); + svfloat32_t v548 = svadd_f32_x(svptrue_b32(), v547, v513); + svfloat32_t v550 = svadd_f32_x(svptrue_b32(), v549, v515); + svfloat32_t v560 = svadd_f32_x(svptrue_b32(), v559, v543); + svfloat32_t v562 = svadd_f32_x(svptrue_b32(), v561, v545); + svint16_t v636 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v633, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v645 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v642, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v654 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v651, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v663 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v660, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v669 = svadd_f32_x(svptrue_b32(), v554, v566); + svfloat32_t v678 = svsub_f32_x(svptrue_b32(), v554, v566); + svfloat32_t v687 = svadd_f32_x(svptrue_b32(), v553, v565); + svfloat32_t v696 = svsub_f32_x(svptrue_b32(), v553, v565); + svint16_t v708 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v705, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v717 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v714, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v986), svreinterpret_u64_s16(v600)); + svst1w_u64(pred_full, (unsigned *)(v995), svreinterpret_u64_s16(v609)); + svst1w_u64(pred_full, (unsigned *)(v1004), svreinterpret_u64_s16(v618)); + svst1w_u64(pred_full, (unsigned *)(v1013), svreinterpret_u64_s16(v627)); + svfloat32_t v579 = svadd_f32_x(svptrue_b32(), v548, v560); + svfloat32_t v588 = svsub_f32_x(svptrue_b32(), v548, v560); + svint16_t v672 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v669, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 
0x0000000000040004ULL))); + svint16_t v681 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v678, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v690 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v687, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v699 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v696, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v723 = svadd_f32_x(svptrue_b32(), v550, v562); + svfloat32_t v732 = svsub_f32_x(svptrue_b32(), v550, v562); + svst1w_u64(pred_full, (unsigned *)(v1022), svreinterpret_u64_s16(v636)); + svst1w_u64(pred_full, (unsigned *)(v1031), svreinterpret_u64_s16(v645)); + svst1w_u64(pred_full, (unsigned *)(v1040), svreinterpret_u64_s16(v654)); + svst1w_u64(pred_full, (unsigned *)(v1049), svreinterpret_u64_s16(v663)); + svst1w_u64(pred_full, (unsigned *)(v1094), svreinterpret_u64_s16(v708)); + svst1w_u64(pred_full, (unsigned *)(v1103), svreinterpret_u64_s16(v717)); + svint16_t v582 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v579, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v591 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v588, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v726 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v723, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v735 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v732, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v1058), svreinterpret_u64_s16(v672)); + svst1w_u64(pred_full, (unsigned *)(v1067), svreinterpret_u64_s16(v681)); + svst1w_u64(pred_full, (unsigned *)(v1076), svreinterpret_u64_s16(v690)); + svst1w_u64(pred_full, (unsigned *)(v1085), svreinterpret_u64_s16(v699)); + svst1w_u64(pred_full, (unsigned *)(v968), svreinterpret_u64_s16(v582)); + svst1w_u64(pred_full, (unsigned *)(v977), svreinterpret_u64_s16(v591)); + svst1w_u64(pred_full, (unsigned *)(v1112), svreinterpret_u64_s16(v726)); + svst1w_u64(pred_full, (unsigned *)(v1121), svreinterpret_u64_s16(v735)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uu20(const armral_cmplx_int16_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + for (int j = 0; j < howmany; j += 1) { + int16x4_t v154 = vld1s_s16(&v5[istride]); + float v286 = 1.5388417685876268e+00F; + float v293 = 5.8778525229247325e-01F; + float v300 = 3.6327126400268028e-01F; + float v324 = 1.0000000000000000e+00F; + float v325 = -1.0000000000000000e+00F; + float v331 = -1.2500000000000000e+00F; + float v332 = 1.2500000000000000e+00F; + float v338 = 5.5901699437494745e-01F; + float 
v339 = -5.5901699437494745e-01F; + float32x2_t v341 = (float32x2_t){v4, v4}; + float v346 = -1.5388417685876268e+00F; + float v350 = -5.8778525229247325e-01F; + float v354 = -3.6327126400268028e-01F; + int16x4_t v20 = vld1s_s16(&v5[0]); + float32x2_t v155 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v154)), 15); + float32x2_t v280 = (float32x2_t){v331, v331}; + float32x2_t v284 = (float32x2_t){v338, v338}; + float32x2_t v288 = (float32x2_t){v286, v346}; + float32x2_t v295 = (float32x2_t){v293, v350}; + float32x2_t v302 = (float32x2_t){v300, v354}; + float32x2_t v326 = (float32x2_t){v324, v325}; + float32x2_t v333 = (float32x2_t){v331, v332}; + float32x2_t v340 = (float32x2_t){v338, v339}; + float32x2_t v347 = (float32x2_t){v346, v346}; + float32x2_t v351 = (float32x2_t){v350, v350}; + float32x2_t v355 = (float32x2_t){v354, v354}; + float32x2_t v21 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v20)), 15); + int16x4_t v26 = vld1s_s16(&v5[istride * 10]); + int16x4_t v34 = vld1s_s16(&v5[istride * 5]); + int16x4_t v40 = vld1s_s16(&v5[istride * 15]); + int16x4_t v50 = vld1s_s16(&v5[istride * 4]); + int16x4_t v56 = vld1s_s16(&v5[istride * 14]); + int16x4_t v64 = vld1s_s16(&v5[istride * 9]); + int16x4_t v70 = vld1s_s16(&v5[istride * 19]); + int16x4_t v80 = vld1s_s16(&v5[istride * 8]); + int16x4_t v86 = vld1s_s16(&v5[istride * 18]); + int16x4_t v94 = vld1s_s16(&v5[istride * 13]); + int16x4_t v100 = vld1s_s16(&v5[istride * 3]); + int16x4_t v110 = vld1s_s16(&v5[istride * 12]); + int16x4_t v116 = vld1s_s16(&v5[istride * 2]); + int16x4_t v124 = vld1s_s16(&v5[istride * 17]); + int16x4_t v130 = vld1s_s16(&v5[istride * 7]); + int16x4_t v140 = vld1s_s16(&v5[istride * 16]); + int16x4_t v146 = vld1s_s16(&v5[istride * 6]); + int16x4_t v160 = vld1s_s16(&v5[istride * 11]); + float32x2_t v290 = vmul_f32(v341, v288); + float32x2_t v297 = vmul_f32(v341, v295); + float32x2_t v304 = vmul_f32(v341, v302); + float32x2_t v328 = vmul_f32(v341, v326); + float32x2_t v335 = vmul_f32(v341, v333); + float32x2_t v342 = vmul_f32(v341, v340); + float32x2_t v27 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v26)), 15); + float32x2_t v35 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v34)), 15); + float32x2_t v41 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v40)), 15); + float32x2_t v51 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v50)), 15); + float32x2_t v57 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v56)), 15); + float32x2_t v65 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v64)), 15); + float32x2_t v71 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v70)), 15); + float32x2_t v81 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v80)), 15); + float32x2_t v87 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v86)), 15); + float32x2_t v95 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v94)), 15); + float32x2_t v101 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v100)), 15); + float32x2_t v111 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v110)), 15); + float32x2_t v117 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v116)), 15); + float32x2_t v125 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v124)), 15); + float32x2_t v131 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v130)), 15); + float32x2_t v141 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v140)), 15); + float32x2_t v147 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v146)), 15); + float32x2_t v161 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v160)), 15); + float32x2_t v28 = vadd_f32(v21, v27); + float32x2_t v29 = vsub_f32(v21, v27); + float32x2_t v42 = vadd_f32(v35, v41); + float32x2_t v43 = vsub_f32(v35, v41); + float32x2_t v58 = vadd_f32(v51, v57); + float32x2_t v59 = vsub_f32(v51, v57); + 
float32x2_t v72 = vadd_f32(v65, v71); + float32x2_t v73 = vsub_f32(v65, v71); + float32x2_t v88 = vadd_f32(v81, v87); + float32x2_t v89 = vsub_f32(v81, v87); + float32x2_t v102 = vadd_f32(v95, v101); + float32x2_t v103 = vsub_f32(v95, v101); + float32x2_t v118 = vadd_f32(v111, v117); + float32x2_t v119 = vsub_f32(v111, v117); + float32x2_t v132 = vadd_f32(v125, v131); + float32x2_t v133 = vsub_f32(v125, v131); + float32x2_t v148 = vadd_f32(v141, v147); + float32x2_t v149 = vsub_f32(v141, v147); + float32x2_t v162 = vadd_f32(v155, v161); + float32x2_t v163 = vsub_f32(v155, v161); + float32x2_t v44 = vadd_f32(v28, v42); + float32x2_t v45 = vsub_f32(v28, v42); + float32x2_t v74 = vadd_f32(v58, v72); + float32x2_t v75 = vsub_f32(v58, v72); + float32x2_t v104 = vadd_f32(v88, v102); + float32x2_t v105 = vsub_f32(v88, v102); + float32x2_t v134 = vadd_f32(v118, v132); + float32x2_t v135 = vsub_f32(v118, v132); + float32x2_t v164 = vadd_f32(v148, v162); + float32x2_t v165 = vsub_f32(v148, v162); + float32x2_t v266 = vadd_f32(v59, v149); + float32x2_t v267 = vsub_f32(v59, v149); + float32x2_t v268 = vadd_f32(v119, v89); + float32x2_t v269 = vsub_f32(v119, v89); + float32x2_t v316 = vadd_f32(v73, v163); + float32x2_t v317 = vsub_f32(v73, v163); + float32x2_t v318 = vadd_f32(v133, v103); + float32x2_t v319 = vsub_f32(v133, v103); + float32x2_t v166 = vadd_f32(v74, v164); + float32x2_t v167 = vsub_f32(v74, v164); + float32x2_t v168 = vadd_f32(v134, v104); + float32x2_t v169 = vsub_f32(v134, v104); + float32x2_t v216 = vadd_f32(v75, v165); + float32x2_t v217 = vsub_f32(v75, v165); + float32x2_t v218 = vadd_f32(v135, v105); + float32x2_t v219 = vsub_f32(v135, v105); + float32x2_t v270 = vadd_f32(v266, v268); + float32x2_t v271 = vsub_f32(v266, v268); + float32x2_t v272 = vadd_f32(v267, v269); + float32x2_t v291 = vrev64_f32(v267); + float32x2_t v305 = vrev64_f32(v269); + float32x2_t v320 = vadd_f32(v316, v318); + float32x2_t v321 = vsub_f32(v316, v318); + float32x2_t v322 = vadd_f32(v317, v319); + float32x2_t v348 = vmul_f32(v317, v347); + float32x2_t v356 = vmul_f32(v319, v355); + float32x2_t v170 = vadd_f32(v166, v168); + float32x2_t v171 = vsub_f32(v166, v168); + float32x2_t v172 = vadd_f32(v167, v169); + float32x2_t v191 = vrev64_f32(v167); + float32x2_t v205 = vrev64_f32(v169); + float32x2_t v220 = vadd_f32(v216, v218); + float32x2_t v221 = vsub_f32(v216, v218); + float32x2_t v222 = vadd_f32(v217, v219); + float32x2_t v241 = vrev64_f32(v217); + float32x2_t v255 = vrev64_f32(v219); + float32x2_t v273 = vadd_f32(v270, v29); + float32x2_t v281 = vmul_f32(v270, v280); + float32x2_t v285 = vmul_f32(v271, v284); + float32x2_t v292 = vmul_f32(v291, v290); + float32x2_t v298 = vrev64_f32(v272); + float32x2_t v306 = vmul_f32(v305, v304); + float32x2_t v323 = vadd_f32(v320, v43); + float32x2_t v336 = vrev64_f32(v320); + float32x2_t v343 = vrev64_f32(v321); + float32x2_t v352 = vmul_f32(v322, v351); + float32x2_t v173 = vadd_f32(v170, v44); + float32x2_t v181 = vmul_f32(v170, v280); + float32x2_t v185 = vmul_f32(v171, v284); + float32x2_t v192 = vmul_f32(v191, v290); + float32x2_t v198 = vrev64_f32(v172); + float32x2_t v206 = vmul_f32(v205, v304); + float32x2_t v223 = vadd_f32(v220, v45); + float32x2_t v231 = vmul_f32(v220, v280); + float32x2_t v235 = vmul_f32(v221, v284); + float32x2_t v242 = vmul_f32(v241, v290); + float32x2_t v248 = vrev64_f32(v222); + float32x2_t v256 = vmul_f32(v255, v304); + float32x2_t v299 = vmul_f32(v298, v297); + float32x2_t v307 = vadd_f32(v273, v281); + float32x2_t v329 = 
vrev64_f32(v323); + float32x2_t v337 = vmul_f32(v336, v335); + float32x2_t v344 = vmul_f32(v343, v342); + float32x2_t v360 = vsub_f32(v348, v352); + float32x2_t v361 = vadd_f32(v352, v356); + float32x2_t v199 = vmul_f32(v198, v297); + float32x2_t v207 = vadd_f32(v173, v181); + float32x2_t v249 = vmul_f32(v248, v297); + float32x2_t v257 = vadd_f32(v223, v231); + float32x2_t v308 = vadd_f32(v307, v285); + float32x2_t v309 = vsub_f32(v307, v285); + float32x2_t v310 = vsub_f32(v292, v299); + float32x2_t v311 = vadd_f32(v299, v306); + float32x2_t v330 = vmul_f32(v329, v328); + int16x4_t v370 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v173, 15), (int32x2_t){0, 0})); + int16x4_t v382 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v223, 15), (int32x2_t){0, 0})); + float32x2_t v208 = vadd_f32(v207, v185); + float32x2_t v209 = vsub_f32(v207, v185); + float32x2_t v210 = vsub_f32(v192, v199); + float32x2_t v211 = vadd_f32(v199, v206); + float32x2_t v258 = vadd_f32(v257, v235); + float32x2_t v259 = vsub_f32(v257, v235); + float32x2_t v260 = vsub_f32(v242, v249); + float32x2_t v261 = vadd_f32(v249, v256); + float32x2_t v312 = vadd_f32(v308, v310); + float32x2_t v313 = vsub_f32(v308, v310); + float32x2_t v314 = vadd_f32(v309, v311); + float32x2_t v315 = vsub_f32(v309, v311); + float32x2_t v357 = vadd_f32(v330, v337); + float32x2_t v366 = vadd_f32(v273, v330); + float32x2_t v367 = vsub_f32(v273, v330); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v370), 0); + v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v382), 0); + float32x2_t v212 = vadd_f32(v208, v210); + float32x2_t v213 = vsub_f32(v208, v210); + float32x2_t v214 = vadd_f32(v209, v211); + float32x2_t v215 = vsub_f32(v209, v211); + float32x2_t v262 = vadd_f32(v258, v260); + float32x2_t v263 = vsub_f32(v258, v260); + float32x2_t v264 = vadd_f32(v259, v261); + float32x2_t v265 = vsub_f32(v259, v261); + float32x2_t v358 = vadd_f32(v357, v344); + float32x2_t v359 = vsub_f32(v357, v344); + int16x4_t v376 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v367, 15), (int32x2_t){0, 0})); + int16x4_t v388 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v366, 15), (int32x2_t){0, 0})); + float32x2_t v362 = vadd_f32(v358, v360); + float32x2_t v363 = vsub_f32(v358, v360); + float32x2_t v364 = vadd_f32(v359, v361); + float32x2_t v365 = vsub_f32(v359, v361); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v376), 0); + v6[ostride * 15] = vget_lane_s32(vreinterpret_s32_s16(v388), 0); + int16x4_t v396 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v213, 15), (int32x2_t){0, 0})); + int16x4_t v408 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v263, 15), (int32x2_t){0, 0})); + int16x4_t v422 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v215, 15), (int32x2_t){0, 0})); + int16x4_t v434 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v265, 15), (int32x2_t){0, 0})); + int16x4_t v448 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v214, 15), (int32x2_t){0, 0})); + int16x4_t v460 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v264, 15), (int32x2_t){0, 0})); + int16x4_t v474 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v212, 15), (int32x2_t){0, 0})); + int16x4_t v486 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v262, 15), (int32x2_t){0, 0})); + float32x2_t v392 = vadd_f32(v313, v363); + float32x2_t v393 = vsub_f32(v313, v363); + v6[ostride * 16] = vget_lane_s32(vreinterpret_s32_s16(v396), 0); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v408), 0); + float32x2_t v418 = vadd_f32(v315, v365); + float32x2_t v419 = vsub_f32(v315, v365); + v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v422), 
0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v434), 0); + float32x2_t v444 = vadd_f32(v314, v364); + float32x2_t v445 = vsub_f32(v314, v364); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v448), 0); + v6[ostride * 18] = vget_lane_s32(vreinterpret_s32_s16(v460), 0); + float32x2_t v470 = vadd_f32(v312, v362); + float32x2_t v471 = vsub_f32(v312, v362); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v474), 0); + v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v486), 0); + int16x4_t v402 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v393, 15), (int32x2_t){0, 0})); + int16x4_t v414 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v392, 15), (int32x2_t){0, 0})); + int16x4_t v428 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v419, 15), (int32x2_t){0, 0})); + int16x4_t v440 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v418, 15), (int32x2_t){0, 0})); + int16x4_t v454 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v445, 15), (int32x2_t){0, 0})); + int16x4_t v466 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v444, 15), (int32x2_t){0, 0})); + int16x4_t v480 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v471, 15), (int32x2_t){0, 0})); + int16x4_t v492 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v470, 15), (int32x2_t){0, 0})); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v402), 0); + v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v414), 0); + v6[ostride * 17] = vget_lane_s32(vreinterpret_s32_s16(v428), 0); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v440), 0); + v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v454), 0); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v466), 0); + v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v480), 0); + v6[ostride * 19] = vget_lane_s32(vreinterpret_s32_s16(v492), 0); + v5 += 1 * 1; + v6 += 1 * 1; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uu20(const armral_cmplx_int16_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * 1; + int64_t v12 = v10 * 1; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + float v328 = -1.2500000000000000e+00F; + float v333 = 5.5901699437494745e-01F; + float v376 = -1.0000000000000000e+00F; + float v383 = 1.2500000000000000e+00F; + float v390 = -5.5901699437494745e-01F; + float v397 = -1.5388417685876268e+00F; + float v402 = -5.8778525229247325e-01F; + float v407 = -3.6327126400268028e-01F; + const int32_t *v759 = &v5[v0]; + int32_t *v847 = &v6[v2]; + int64_t v27 = v0 * 10; + int64_t v37 = v0 * 5; + int64_t v45 = v0 * 15; + int64_t v57 = v0 * 4; + int64_t v65 = v0 * 14; + int64_t v75 = v0 * 9; + int64_t v83 = v0 * 19; + int64_t v95 = v0 * 8; + int64_t v103 = v0 * 18; + int64_t v113 = v0 * 13; + int64_t v121 = v0 * 3; + int64_t v133 = v0 * 12; + int64_t v141 = v0 * 2; + int64_t v151 = v0 * 17; + int64_t v159 = v0 * 7; + int64_t v171 = v0 * 16; + int64_t v179 = v0 * 6; + int64_t v197 = v0 * 11; + float v341 = v4 * v397; + float v348 = v4 * v402; + float v355 = v4 * v407; + float v379 = v4 * v376; + float v386 = v4 * v383; + float v393 = v4 * v390; + int64_t v431 = v2 * 5; + int64_t v439 = v2 * 10; + int64_t v447 = v2 * 15; + int64_t v457 = v2 * 16; + int64_t v473 = v2 * 6; + int64_t v481 = v2 * 11; + int64_t v491 = v2 * 12; + int64_t v499 = v2 * 17; + 
int64_t v507 = v2 * 2; + int64_t v515 = v2 * 7; + int64_t v525 = v2 * 8; + int64_t v533 = v2 * 13; + int64_t v541 = v2 * 18; + int64_t v549 = v2 * 3; + int64_t v559 = v2 * 4; + int64_t v567 = v2 * 9; + int64_t v575 = v2 * 14; + int64_t v583 = v2 * 19; + const int32_t *v597 = &v5[0]; + svfloat32_t v784 = svdup_n_f32(v328); + svfloat32_t v785 = svdup_n_f32(v333); + svfloat32_t v792 = svdup_n_f32(v397); + svfloat32_t v793 = svdup_n_f32(v402); + svfloat32_t v794 = svdup_n_f32(v407); + int32_t *v802 = &v6[0]; + svfloat32_t v195 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v759[0])), + 1.F / (1ULL << 15ULL)); + const int32_t *v606 = &v5[v27]; + const int32_t *v615 = &v5[v37]; + const int32_t *v624 = &v5[v45]; + const int32_t *v633 = &v5[v57]; + const int32_t *v642 = &v5[v65]; + const int32_t *v651 = &v5[v75]; + const int32_t *v660 = &v5[v83]; + const int32_t *v669 = &v5[v95]; + const int32_t *v678 = &v5[v103]; + const int32_t *v687 = &v5[v113]; + const int32_t *v696 = &v5[v121]; + const int32_t *v705 = &v5[v133]; + const int32_t *v714 = &v5[v141]; + const int32_t *v723 = &v5[v151]; + const int32_t *v732 = &v5[v159]; + const int32_t *v741 = &v5[v171]; + const int32_t *v750 = &v5[v179]; + const int32_t *v768 = &v5[v197]; + svfloat32_t v786 = svdup_n_f32(v341); + svfloat32_t v787 = svdup_n_f32(v348); + svfloat32_t v788 = svdup_n_f32(v355); + svfloat32_t v789 = svdup_n_f32(v379); + svfloat32_t v790 = svdup_n_f32(v386); + svfloat32_t v791 = svdup_n_f32(v393); + int32_t *v811 = &v6[v431]; + int32_t *v820 = &v6[v439]; + int32_t *v829 = &v6[v447]; + int32_t *v838 = &v6[v457]; + int32_t *v856 = &v6[v473]; + int32_t *v865 = &v6[v481]; + int32_t *v874 = &v6[v491]; + int32_t *v883 = &v6[v499]; + int32_t *v892 = &v6[v507]; + int32_t *v901 = &v6[v515]; + int32_t *v910 = &v6[v525]; + int32_t *v919 = &v6[v533]; + int32_t *v928 = &v6[v541]; + int32_t *v937 = &v6[v549]; + int32_t *v946 = &v6[v559]; + int32_t *v955 = &v6[v567]; + int32_t *v964 = &v6[v575]; + int32_t *v973 = &v6[v583]; + svfloat32_t v25 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v597[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v33 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v606[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v43 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v615[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v51 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v624[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v63 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v633[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v71 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v642[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v81 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v651[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v89 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v660[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v101 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v669[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v109 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + 
svld1sh_s32(pred_full, (const int16_t *)&v678[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v119 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v687[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v127 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v696[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v139 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v705[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v147 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v714[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v157 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v723[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v165 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v732[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v177 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v741[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v185 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v750[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v203 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v768[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v34 = svadd_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v35 = svsub_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v52 = svadd_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v53 = svsub_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v72 = svadd_f32_x(svptrue_b32(), v63, v71); + svfloat32_t v73 = svsub_f32_x(svptrue_b32(), v63, v71); + svfloat32_t v90 = svadd_f32_x(svptrue_b32(), v81, v89); + svfloat32_t v91 = svsub_f32_x(svptrue_b32(), v81, v89); + svfloat32_t v110 = svadd_f32_x(svptrue_b32(), v101, v109); + svfloat32_t v111 = svsub_f32_x(svptrue_b32(), v101, v109); + svfloat32_t v128 = svadd_f32_x(svptrue_b32(), v119, v127); + svfloat32_t v129 = svsub_f32_x(svptrue_b32(), v119, v127); + svfloat32_t v148 = svadd_f32_x(svptrue_b32(), v139, v147); + svfloat32_t v149 = svsub_f32_x(svptrue_b32(), v139, v147); + svfloat32_t v166 = svadd_f32_x(svptrue_b32(), v157, v165); + svfloat32_t v167 = svsub_f32_x(svptrue_b32(), v157, v165); + svfloat32_t v186 = svadd_f32_x(svptrue_b32(), v177, v185); + svfloat32_t v187 = svsub_f32_x(svptrue_b32(), v177, v185); + svfloat32_t v204 = svadd_f32_x(svptrue_b32(), v195, v203); + svfloat32_t v205 = svsub_f32_x(svptrue_b32(), v195, v203); + svfloat32_t v54 = svadd_f32_x(svptrue_b32(), v34, v52); + svfloat32_t v55 = svsub_f32_x(svptrue_b32(), v34, v52); + svfloat32_t v92 = svadd_f32_x(svptrue_b32(), v72, v90); + svfloat32_t v93 = svsub_f32_x(svptrue_b32(), v72, v90); + svfloat32_t v130 = svadd_f32_x(svptrue_b32(), v110, v128); + svfloat32_t v131 = svsub_f32_x(svptrue_b32(), v110, v128); + svfloat32_t v168 = svadd_f32_x(svptrue_b32(), v148, v166); + svfloat32_t v169 = svsub_f32_x(svptrue_b32(), v148, v166); + svfloat32_t v206 = svadd_f32_x(svptrue_b32(), v186, v204); + svfloat32_t v207 = svsub_f32_x(svptrue_b32(), v186, v204); + svfloat32_t v314 = svadd_f32_x(svptrue_b32(), v73, v187); + svfloat32_t v315 = svsub_f32_x(svptrue_b32(), v73, v187); + svfloat32_t v316 = svadd_f32_x(svptrue_b32(), v149, v111); + svfloat32_t v317 = svsub_f32_x(svptrue_b32(), v149, v111); + svfloat32_t v367 
= svadd_f32_x(svptrue_b32(), v91, v205); + svfloat32_t v368 = svsub_f32_x(svptrue_b32(), v91, v205); + svfloat32_t v369 = svadd_f32_x(svptrue_b32(), v167, v129); + svfloat32_t v370 = svsub_f32_x(svptrue_b32(), v167, v129); + svfloat32_t v208 = svadd_f32_x(svptrue_b32(), v92, v206); + svfloat32_t v209 = svsub_f32_x(svptrue_b32(), v92, v206); + svfloat32_t v210 = svadd_f32_x(svptrue_b32(), v168, v130); + svfloat32_t v211 = svsub_f32_x(svptrue_b32(), v168, v130); + svfloat32_t v261 = svadd_f32_x(svptrue_b32(), v93, v207); + svfloat32_t v262 = svsub_f32_x(svptrue_b32(), v93, v207); + svfloat32_t v263 = svadd_f32_x(svptrue_b32(), v169, v131); + svfloat32_t v264 = svsub_f32_x(svptrue_b32(), v169, v131); + svfloat32_t v318 = svadd_f32_x(svptrue_b32(), v314, v316); + svfloat32_t v319 = svsub_f32_x(svptrue_b32(), v314, v316); + svfloat32_t v320 = svadd_f32_x(svptrue_b32(), v315, v317); + svfloat32_t zero343 = svdup_n_f32(0); + svfloat32_t v343 = svcmla_f32_x(pred_full, zero343, v786, v315, 90); + svfloat32_t v371 = svadd_f32_x(svptrue_b32(), v367, v369); + svfloat32_t v372 = svsub_f32_x(svptrue_b32(), v367, v369); + svfloat32_t v373 = svadd_f32_x(svptrue_b32(), v368, v370); + svfloat32_t v410 = svmul_f32_x(svptrue_b32(), v370, v794); + svfloat32_t v212 = svadd_f32_x(svptrue_b32(), v208, v210); + svfloat32_t v213 = svsub_f32_x(svptrue_b32(), v208, v210); + svfloat32_t v214 = svadd_f32_x(svptrue_b32(), v209, v211); + svfloat32_t zero237 = svdup_n_f32(0); + svfloat32_t v237 = svcmla_f32_x(pred_full, zero237, v786, v209, 90); + svfloat32_t v265 = svadd_f32_x(svptrue_b32(), v261, v263); + svfloat32_t v266 = svsub_f32_x(svptrue_b32(), v261, v263); + svfloat32_t v267 = svadd_f32_x(svptrue_b32(), v262, v264); + svfloat32_t zero290 = svdup_n_f32(0); + svfloat32_t v290 = svcmla_f32_x(pred_full, zero290, v786, v262, 90); + svfloat32_t v321 = svadd_f32_x(svptrue_b32(), v318, v35); + svfloat32_t zero350 = svdup_n_f32(0); + svfloat32_t v350 = svcmla_f32_x(pred_full, zero350, v787, v320, 90); + svfloat32_t v374 = svadd_f32_x(svptrue_b32(), v371, v53); + svfloat32_t zero395 = svdup_n_f32(0); + svfloat32_t v395 = svcmla_f32_x(pred_full, zero395, v791, v372, 90); + svfloat32_t v405 = svmul_f32_x(svptrue_b32(), v373, v793); + svfloat32_t v215 = svadd_f32_x(svptrue_b32(), v212, v54); + svfloat32_t zero244 = svdup_n_f32(0); + svfloat32_t v244 = svcmla_f32_x(pred_full, zero244, v787, v214, 90); + svfloat32_t v268 = svadd_f32_x(svptrue_b32(), v265, v55); + svfloat32_t zero297 = svdup_n_f32(0); + svfloat32_t v297 = svcmla_f32_x(pred_full, zero297, v787, v267, 90); + svfloat32_t v358 = svmla_f32_x(pred_full, v321, v318, v784); + svfloat32_t v361 = svsub_f32_x(svptrue_b32(), v343, v350); + svfloat32_t v362 = svcmla_f32_x(pred_full, v350, v788, v317, 90); + svfloat32_t zero381 = svdup_n_f32(0); + svfloat32_t v381 = svcmla_f32_x(pred_full, zero381, v789, v374, 90); + svfloat32_t v414 = svnmls_f32_x(pred_full, v405, v368, v792); + svfloat32_t v415 = svmla_f32_x(pred_full, v410, v373, v793); + svfloat32_t v252 = svmla_f32_x(pred_full, v215, v212, v784); + svfloat32_t v255 = svsub_f32_x(svptrue_b32(), v237, v244); + svfloat32_t v256 = svcmla_f32_x(pred_full, v244, v788, v211, 90); + svfloat32_t v305 = svmla_f32_x(pred_full, v268, v265, v784); + svfloat32_t v308 = svsub_f32_x(svptrue_b32(), v290, v297); + svfloat32_t v309 = svcmla_f32_x(pred_full, v297, v788, v264, 90); + svfloat32_t v359 = svmla_f32_x(pred_full, v358, v319, v785); + svfloat32_t v360 = svmls_f32_x(pred_full, v358, v319, v785); + svfloat32_t v411 = 
svcmla_f32_x(pred_full, v381, v790, v371, 90); + svfloat32_t v420 = svadd_f32_x(svptrue_b32(), v321, v381); + svfloat32_t v421 = svsub_f32_x(svptrue_b32(), v321, v381); + svint16_t v424 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v215, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v440 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v268, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v253 = svmla_f32_x(pred_full, v252, v213, v785); + svfloat32_t v254 = svmls_f32_x(pred_full, v252, v213, v785); + svfloat32_t v306 = svmla_f32_x(pred_full, v305, v266, v785); + svfloat32_t v307 = svmls_f32_x(pred_full, v305, v266, v785); + svfloat32_t v363 = svadd_f32_x(svptrue_b32(), v359, v361); + svfloat32_t v364 = svsub_f32_x(svptrue_b32(), v359, v361); + svfloat32_t v365 = svadd_f32_x(svptrue_b32(), v360, v362); + svfloat32_t v366 = svsub_f32_x(svptrue_b32(), v360, v362); + svfloat32_t v412 = svadd_f32_x(svptrue_b32(), v411, v395); + svfloat32_t v413 = svsub_f32_x(svptrue_b32(), v411, v395); + svint16_t v432 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v421, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v448 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v420, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v802), svreinterpret_u64_s16(v424)); + svst1w_u64(pred_full, (unsigned *)(v820), svreinterpret_u64_s16(v440)); + svfloat32_t v257 = svadd_f32_x(svptrue_b32(), v253, v255); + svfloat32_t v258 = svsub_f32_x(svptrue_b32(), v253, v255); + svfloat32_t v259 = svadd_f32_x(svptrue_b32(), v254, v256); + svfloat32_t v260 = svsub_f32_x(svptrue_b32(), v254, v256); + svfloat32_t v310 = svadd_f32_x(svptrue_b32(), v306, v308); + svfloat32_t v311 = svsub_f32_x(svptrue_b32(), v306, v308); + svfloat32_t v312 = svadd_f32_x(svptrue_b32(), v307, v309); + svfloat32_t v313 = svsub_f32_x(svptrue_b32(), v307, v309); + svfloat32_t v416 = svadd_f32_x(svptrue_b32(), v412, v414); + svfloat32_t v417 = svsub_f32_x(svptrue_b32(), v412, v414); + svfloat32_t v418 = svadd_f32_x(svptrue_b32(), v413, v415); + svfloat32_t v419 = svsub_f32_x(svptrue_b32(), v413, v415); + svst1w_u64(pred_full, (unsigned *)(v811), svreinterpret_u64_s16(v432)); + svst1w_u64(pred_full, (unsigned *)(v829), svreinterpret_u64_s16(v448)); + svfloat32_t v454 = svadd_f32_x(svptrue_b32(), v364, v417); + svfloat32_t v455 = svsub_f32_x(svptrue_b32(), v364, v417); + svint16_t v458 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v258, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v474 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v311, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v488 = svadd_f32_x(svptrue_b32(), v366, v419); + svfloat32_t v489 = svsub_f32_x(svptrue_b32(), v366, v419); + svint16_t v492 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v260, 
(float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v508 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v313, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v522 = svadd_f32_x(svptrue_b32(), v365, v418); + svfloat32_t v523 = svsub_f32_x(svptrue_b32(), v365, v418); + svint16_t v526 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v259, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v542 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v312, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v556 = svadd_f32_x(svptrue_b32(), v363, v416); + svfloat32_t v557 = svsub_f32_x(svptrue_b32(), v363, v416); + svint16_t v560 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v257, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v576 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v310, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v466 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v455, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v482 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v454, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v500 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v489, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v516 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v488, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v534 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v523, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v550 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v522, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v568 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v557, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v584 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v556, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v838), svreinterpret_u64_s16(v458)); + svst1w_u64(pred_full, (unsigned *)(v856), svreinterpret_u64_s16(v474)); + svst1w_u64(pred_full, (unsigned *)(v874), 
svreinterpret_u64_s16(v492)); + svst1w_u64(pred_full, (unsigned *)(v892), svreinterpret_u64_s16(v508)); + svst1w_u64(pred_full, (unsigned *)(v910), svreinterpret_u64_s16(v526)); + svst1w_u64(pred_full, (unsigned *)(v928), svreinterpret_u64_s16(v542)); + svst1w_u64(pred_full, (unsigned *)(v946), svreinterpret_u64_s16(v560)); + svst1w_u64(pred_full, (unsigned *)(v964), svreinterpret_u64_s16(v576)); + svst1w_u64(pred_full, (unsigned *)(v847), svreinterpret_u64_s16(v466)); + svst1w_u64(pred_full, (unsigned *)(v865), svreinterpret_u64_s16(v482)); + svst1w_u64(pred_full, (unsigned *)(v883), svreinterpret_u64_s16(v500)); + svst1w_u64(pred_full, (unsigned *)(v901), svreinterpret_u64_s16(v516)); + svst1w_u64(pred_full, (unsigned *)(v919), svreinterpret_u64_s16(v534)); + svst1w_u64(pred_full, (unsigned *)(v937), svreinterpret_u64_s16(v550)); + svst1w_u64(pred_full, (unsigned *)(v955), svreinterpret_u64_s16(v568)); + svst1w_u64(pred_full, (unsigned *)(v973), svreinterpret_u64_s16(v584)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uu21(const armral_cmplx_int16_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + for (int j = 0; j < howmany; j += 1) { + int16x4_t v125 = vld1s_s16(&v5[istride]); + float v185 = -1.1666666666666665e+00F; + float v189 = 7.9015646852540022e-01F; + float v193 = 5.5854267289647742e-02F; + float v197 = 7.3430220123575241e-01F; + float v200 = 4.4095855184409838e-01F; + float v201 = -4.4095855184409838e-01F; + float v207 = 3.4087293062393137e-01F; + float v208 = -3.4087293062393137e-01F; + float v214 = -5.3396936033772524e-01F; + float v215 = 5.3396936033772524e-01F; + float v221 = 8.7484229096165667e-01F; + float v222 = -8.7484229096165667e-01F; + float v265 = -1.4999999999999998e+00F; + float v269 = 1.7499999999999996e+00F; + float v273 = -1.1852347027881001e+00F; + float v277 = -8.3781400934471603e-02F; + float v281 = -1.1014533018536286e+00F; + float v284 = -6.6143782776614746e-01F; + float v285 = 6.6143782776614746e-01F; + float v291 = -5.1130939593589697e-01F; + float v292 = 5.1130939593589697e-01F; + float v298 = 8.0095404050658769e-01F; + float v299 = -8.0095404050658769e-01F; + float v305 = -1.3122634364424848e+00F; + float v306 = 1.3122634364424848e+00F; + float v348 = 8.6602540378443871e-01F; + float v349 = -8.6602540378443871e-01F; + float v355 = -1.0103629710818451e+00F; + float v356 = 1.0103629710818451e+00F; + float v362 = 6.8429557470759583e-01F; + float v363 = -6.8429557470759583e-01F; + float v369 = 4.8371214382601155e-02F; + float v370 = -4.8371214382601155e-02F; + float v376 = 6.3592436032499466e-01F; + float v377 = -6.3592436032499466e-01F; + float32x2_t v379 = (float32x2_t){v4, v4}; + float v384 = -3.8188130791298663e-01F; + float v388 = -2.9520461738277515e-01F; + float v392 = 4.6243103089499693e-01F; + float v396 = -7.5763564827777208e-01F; + int16x4_t v34 = vld1s_s16(&v5[0]); + float32x2_t v126 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v125)), 15); + float32x2_t v186 = (float32x2_t){v185, v185}; + float32x2_t v190 = (float32x2_t){v189, v189}; + float32x2_t v194 = (float32x2_t){v193, v193}; + float32x2_t v198 = (float32x2_t){v197, v197}; + float32x2_t v202 = (float32x2_t){v200, v201}; + float32x2_t v209 = (float32x2_t){v207, v208}; + float32x2_t v216 = (float32x2_t){v214, v215}; + float32x2_t v223 = (float32x2_t){v221, v222}; + float32x2_t v266 = 
(float32x2_t){v265, v265}; + float32x2_t v270 = (float32x2_t){v269, v269}; + float32x2_t v274 = (float32x2_t){v273, v273}; + float32x2_t v278 = (float32x2_t){v277, v277}; + float32x2_t v282 = (float32x2_t){v281, v281}; + float32x2_t v286 = (float32x2_t){v284, v285}; + float32x2_t v293 = (float32x2_t){v291, v292}; + float32x2_t v300 = (float32x2_t){v298, v299}; + float32x2_t v307 = (float32x2_t){v305, v306}; + float32x2_t v350 = (float32x2_t){v348, v349}; + float32x2_t v357 = (float32x2_t){v355, v356}; + float32x2_t v364 = (float32x2_t){v362, v363}; + float32x2_t v371 = (float32x2_t){v369, v370}; + float32x2_t v378 = (float32x2_t){v376, v377}; + float32x2_t v385 = (float32x2_t){v384, v384}; + float32x2_t v389 = (float32x2_t){v388, v388}; + float32x2_t v393 = (float32x2_t){v392, v392}; + float32x2_t v397 = (float32x2_t){v396, v396}; + int16x4_t v20 = vld1s_s16(&v5[istride * 7]); + int16x4_t v26 = vld1s_s16(&v5[istride * 14]); + float32x2_t v35 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v34)), 15); + int16x4_t v41 = vld1s_s16(&v5[istride * 10]); + int16x4_t v47 = vld1s_s16(&v5[istride * 17]); + int16x4_t v55 = vld1s_s16(&v5[istride * 3]); + int16x4_t v62 = vld1s_s16(&v5[istride * 13]); + int16x4_t v68 = vld1s_s16(&v5[istride * 20]); + int16x4_t v76 = vld1s_s16(&v5[istride * 6]); + int16x4_t v83 = vld1s_s16(&v5[istride * 16]); + int16x4_t v89 = vld1s_s16(&v5[istride * 2]); + int16x4_t v97 = vld1s_s16(&v5[istride * 9]); + int16x4_t v104 = vld1s_s16(&v5[istride * 19]); + int16x4_t v110 = vld1s_s16(&v5[istride * 5]); + int16x4_t v118 = vld1s_s16(&v5[istride * 12]); + int16x4_t v131 = vld1s_s16(&v5[istride * 8]); + int16x4_t v139 = vld1s_s16(&v5[istride * 15]); + int16x4_t v146 = vld1s_s16(&v5[istride * 4]); + int16x4_t v152 = vld1s_s16(&v5[istride * 11]); + int16x4_t v160 = vld1s_s16(&v5[istride * 18]); + float32x2_t v204 = vmul_f32(v379, v202); + float32x2_t v211 = vmul_f32(v379, v209); + float32x2_t v218 = vmul_f32(v379, v216); + float32x2_t v225 = vmul_f32(v379, v223); + float32x2_t v288 = vmul_f32(v379, v286); + float32x2_t v295 = vmul_f32(v379, v293); + float32x2_t v302 = vmul_f32(v379, v300); + float32x2_t v309 = vmul_f32(v379, v307); + float32x2_t v352 = vmul_f32(v379, v350); + float32x2_t v359 = vmul_f32(v379, v357); + float32x2_t v366 = vmul_f32(v379, v364); + float32x2_t v373 = vmul_f32(v379, v371); + float32x2_t v380 = vmul_f32(v379, v378); + float32x2_t v21 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v20)), 15); + float32x2_t v27 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v26)), 15); + float32x2_t v42 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v41)), 15); + float32x2_t v48 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v47)), 15); + float32x2_t v56 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v55)), 15); + float32x2_t v63 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v62)), 15); + float32x2_t v69 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v68)), 15); + float32x2_t v77 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v76)), 15); + float32x2_t v84 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v83)), 15); + float32x2_t v90 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v89)), 15); + float32x2_t v98 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v97)), 15); + float32x2_t v105 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v104)), 15); + float32x2_t v111 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v110)), 15); + float32x2_t v119 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v118)), 15); + float32x2_t v132 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v131)), 15); + float32x2_t v140 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v139)), 15); + float32x2_t v147 = 
vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v146)), 15); + float32x2_t v153 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v152)), 15); + float32x2_t v161 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v160)), 15); + float32x2_t v28 = vadd_f32(v21, v27); + float32x2_t v29 = vsub_f32(v21, v27); + float32x2_t v49 = vadd_f32(v42, v48); + float32x2_t v50 = vsub_f32(v42, v48); + float32x2_t v70 = vadd_f32(v63, v69); + float32x2_t v71 = vsub_f32(v63, v69); + float32x2_t v91 = vadd_f32(v84, v90); + float32x2_t v92 = vsub_f32(v84, v90); + float32x2_t v112 = vadd_f32(v105, v111); + float32x2_t v113 = vsub_f32(v105, v111); + float32x2_t v133 = vadd_f32(v126, v132); + float32x2_t v134 = vsub_f32(v126, v132); + float32x2_t v154 = vadd_f32(v147, v153); + float32x2_t v155 = vsub_f32(v147, v153); + float32x2_t v36 = vadd_f32(v28, v35); + float32x2_t v57 = vadd_f32(v49, v56); + float32x2_t v78 = vadd_f32(v70, v77); + float32x2_t v99 = vadd_f32(v91, v98); + float32x2_t v120 = vadd_f32(v112, v119); + float32x2_t v141 = vadd_f32(v133, v140); + float32x2_t v162 = vadd_f32(v154, v161); + float32x2_t v247 = vadd_f32(v49, v154); + float32x2_t v248 = vsub_f32(v49, v154); + float32x2_t v249 = vadd_f32(v112, v91); + float32x2_t v250 = vsub_f32(v112, v91); + float32x2_t v251 = vadd_f32(v70, v133); + float32x2_t v252 = vsub_f32(v70, v133); + float32x2_t v331 = vadd_f32(v50, v155); + float32x2_t v332 = vsub_f32(v50, v155); + float32x2_t v333 = vadd_f32(v113, v92); + float32x2_t v334 = vsub_f32(v113, v92); + float32x2_t v335 = vadd_f32(v71, v134); + float32x2_t v336 = vsub_f32(v71, v134); + float32x2_t v163 = vadd_f32(v57, v162); + float32x2_t v164 = vsub_f32(v57, v162); + float32x2_t v165 = vadd_f32(v120, v99); + float32x2_t v166 = vsub_f32(v120, v99); + float32x2_t v167 = vadd_f32(v78, v141); + float32x2_t v168 = vsub_f32(v78, v141); + float32x2_t v253 = vadd_f32(v247, v249); + float32x2_t v256 = vsub_f32(v247, v249); + float32x2_t v257 = vsub_f32(v249, v251); + float32x2_t v258 = vsub_f32(v251, v247); + float32x2_t v259 = vadd_f32(v248, v250); + float32x2_t v261 = vsub_f32(v248, v250); + float32x2_t v262 = vsub_f32(v250, v252); + float32x2_t v263 = vsub_f32(v252, v248); + float32x2_t v337 = vadd_f32(v331, v333); + float32x2_t v340 = vsub_f32(v331, v333); + float32x2_t v341 = vsub_f32(v333, v335); + float32x2_t v342 = vsub_f32(v335, v331); + float32x2_t v343 = vadd_f32(v332, v334); + float32x2_t v345 = vsub_f32(v332, v334); + float32x2_t v346 = vsub_f32(v334, v336); + float32x2_t v347 = vsub_f32(v336, v332); + float32x2_t v169 = vadd_f32(v163, v165); + float32x2_t v172 = vsub_f32(v163, v165); + float32x2_t v173 = vsub_f32(v165, v167); + float32x2_t v174 = vsub_f32(v167, v163); + float32x2_t v175 = vadd_f32(v164, v166); + float32x2_t v177 = vsub_f32(v164, v166); + float32x2_t v178 = vsub_f32(v166, v168); + float32x2_t v179 = vsub_f32(v168, v164); + float32x2_t v254 = vadd_f32(v253, v251); + float32x2_t v260 = vadd_f32(v259, v252); + float32x2_t v275 = vmul_f32(v256, v274); + float32x2_t v279 = vmul_f32(v257, v278); + float32x2_t v283 = vmul_f32(v258, v282); + float32x2_t v296 = vrev64_f32(v261); + float32x2_t v303 = vrev64_f32(v262); + float32x2_t v310 = vrev64_f32(v263); + float32x2_t v338 = vadd_f32(v337, v335); + float32x2_t v344 = vadd_f32(v343, v336); + float32x2_t v367 = vrev64_f32(v340); + float32x2_t v374 = vrev64_f32(v341); + float32x2_t v381 = vrev64_f32(v342); + float32x2_t v390 = vmul_f32(v345, v389); + float32x2_t v394 = vmul_f32(v346, v393); + float32x2_t v398 = vmul_f32(v347, v397); + float32x2_t v170 = vadd_f32(v169, 
v167); + float32x2_t v176 = vadd_f32(v175, v168); + float32x2_t v191 = vmul_f32(v172, v190); + float32x2_t v195 = vmul_f32(v173, v194); + float32x2_t v199 = vmul_f32(v174, v198); + float32x2_t v212 = vrev64_f32(v177); + float32x2_t v219 = vrev64_f32(v178); + float32x2_t v226 = vrev64_f32(v179); + float32x2_t v255 = vadd_f32(v254, v28); + float32x2_t v271 = vmul_f32(v254, v270); + float32x2_t v289 = vrev64_f32(v260); + float32x2_t v297 = vmul_f32(v296, v295); + float32x2_t v304 = vmul_f32(v303, v302); + float32x2_t v311 = vmul_f32(v310, v309); + float32x2_t v339 = vadd_f32(v338, v29); + float32x2_t v360 = vrev64_f32(v338); + float32x2_t v368 = vmul_f32(v367, v366); + float32x2_t v375 = vmul_f32(v374, v373); + float32x2_t v382 = vmul_f32(v381, v380); + float32x2_t v386 = vmul_f32(v344, v385); + float32x2_t v171 = vadd_f32(v170, v36); + float32x2_t v187 = vmul_f32(v170, v186); + float32x2_t v205 = vrev64_f32(v176); + float32x2_t v213 = vmul_f32(v212, v211); + float32x2_t v220 = vmul_f32(v219, v218); + float32x2_t v227 = vmul_f32(v226, v225); + float32x2_t v267 = vmul_f32(v255, v266); + float32x2_t v290 = vmul_f32(v289, v288); + float32x2_t v353 = vrev64_f32(v339); + float32x2_t v361 = vmul_f32(v360, v359); + float32x2_t v406 = vadd_f32(v386, v390); + float32x2_t v408 = vsub_f32(v386, v390); + float32x2_t v410 = vsub_f32(v386, v394); + float32x2_t v206 = vmul_f32(v205, v204); + float32x2_t v228 = vadd_f32(v171, v187); + float32x2_t v312 = vadd_f32(v267, v271); + float32x2_t v319 = vadd_f32(v290, v297); + float32x2_t v321 = vsub_f32(v290, v297); + float32x2_t v323 = vsub_f32(v290, v304); + float32x2_t v354 = vmul_f32(v353, v352); + float32x2_t v407 = vadd_f32(v406, v394); + float32x2_t v409 = vsub_f32(v408, v398); + float32x2_t v411 = vadd_f32(v410, v398); + float32x2_t v418 = vadd_f32(v171, v267); + int16x4_t v423 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v171, 15), (int32x2_t){0, 0})); + float32x2_t v229 = vadd_f32(v228, v191); + float32x2_t v231 = vsub_f32(v228, v191); + float32x2_t v233 = vsub_f32(v228, v195); + float32x2_t v235 = vadd_f32(v206, v213); + float32x2_t v237 = vsub_f32(v206, v213); + float32x2_t v239 = vsub_f32(v206, v220); + float32x2_t v313 = vadd_f32(v312, v275); + float32x2_t v315 = vsub_f32(v312, v275); + float32x2_t v317 = vsub_f32(v312, v279); + float32x2_t v320 = vadd_f32(v319, v304); + float32x2_t v322 = vsub_f32(v321, v311); + float32x2_t v324 = vadd_f32(v323, v311); + float32x2_t v399 = vadd_f32(v354, v361); + float32x2_t v419 = vadd_f32(v418, v354); + float32x2_t v420 = vsub_f32(v418, v354); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v423), 0); + float32x2_t v230 = vadd_f32(v229, v195); + float32x2_t v232 = vsub_f32(v231, v199); + float32x2_t v234 = vadd_f32(v233, v199); + float32x2_t v236 = vadd_f32(v235, v220); + float32x2_t v238 = vsub_f32(v237, v227); + float32x2_t v240 = vadd_f32(v239, v227); + float32x2_t v314 = vadd_f32(v313, v279); + float32x2_t v316 = vsub_f32(v315, v283); + float32x2_t v318 = vadd_f32(v317, v283); + float32x2_t v400 = vadd_f32(v399, v368); + float32x2_t v402 = vsub_f32(v399, v368); + float32x2_t v404 = vsub_f32(v399, v375); + int16x4_t v429 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v420, 15), (int32x2_t){0, 0})); + int16x4_t v435 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v419, 15), (int32x2_t){0, 0})); + float32x2_t v241 = vadd_f32(v230, v236); + float32x2_t v242 = vsub_f32(v230, v236); + float32x2_t v243 = vadd_f32(v232, v238); + float32x2_t v244 = vsub_f32(v232, v238); + float32x2_t v245 = vadd_f32(v234, v240); + float32x2_t v246 = 
vsub_f32(v234, v240); + float32x2_t v325 = vadd_f32(v314, v320); + float32x2_t v326 = vsub_f32(v314, v320); + float32x2_t v327 = vadd_f32(v316, v322); + float32x2_t v328 = vsub_f32(v316, v322); + float32x2_t v329 = vadd_f32(v318, v324); + float32x2_t v330 = vsub_f32(v318, v324); + float32x2_t v401 = vadd_f32(v400, v375); + float32x2_t v403 = vsub_f32(v402, v382); + float32x2_t v405 = vadd_f32(v404, v382); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v429), 0); + v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v435), 0); + float32x2_t v412 = vadd_f32(v401, v407); + float32x2_t v413 = vsub_f32(v401, v407); + float32x2_t v414 = vadd_f32(v403, v409); + float32x2_t v415 = vsub_f32(v403, v409); + float32x2_t v416 = vadd_f32(v405, v411); + float32x2_t v417 = vsub_f32(v405, v411); + float32x2_t v439 = vadd_f32(v242, v326); + int16x4_t v444 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v242, 15), (int32x2_t){0, 0})); + float32x2_t v460 = vadd_f32(v244, v328); + int16x4_t v465 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v244, 15), (int32x2_t){0, 0})); + float32x2_t v481 = vadd_f32(v245, v329); + int16x4_t v486 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v245, 15), (int32x2_t){0, 0})); + float32x2_t v502 = vadd_f32(v246, v330); + int16x4_t v507 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v246, 15), (int32x2_t){0, 0})); + float32x2_t v523 = vadd_f32(v243, v327); + int16x4_t v528 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v243, 15), (int32x2_t){0, 0})); + float32x2_t v544 = vadd_f32(v241, v325); + int16x4_t v549 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v241, 15), (int32x2_t){0, 0})); + float32x2_t v440 = vadd_f32(v439, v413); + float32x2_t v441 = vsub_f32(v439, v413); + v6[ostride * 15] = vget_lane_s32(vreinterpret_s32_s16(v444), 0); + float32x2_t v461 = vadd_f32(v460, v415); + float32x2_t v462 = vsub_f32(v460, v415); + v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v465), 0); + float32x2_t v482 = vadd_f32(v481, v416); + float32x2_t v483 = vsub_f32(v481, v416); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v486), 0); + float32x2_t v503 = vadd_f32(v502, v417); + float32x2_t v504 = vsub_f32(v502, v417); + v6[ostride * 18] = vget_lane_s32(vreinterpret_s32_s16(v507), 0); + float32x2_t v524 = vadd_f32(v523, v414); + float32x2_t v525 = vsub_f32(v523, v414); + v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v528), 0); + float32x2_t v545 = vadd_f32(v544, v412); + float32x2_t v546 = vsub_f32(v544, v412); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v549), 0); + int16x4_t v450 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v441, 15), (int32x2_t){0, 0})); + int16x4_t v456 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v440, 15), (int32x2_t){0, 0})); + int16x4_t v471 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v462, 15), (int32x2_t){0, 0})); + int16x4_t v477 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v461, 15), (int32x2_t){0, 0})); + int16x4_t v492 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v483, 15), (int32x2_t){0, 0})); + int16x4_t v498 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v482, 15), (int32x2_t){0, 0})); + int16x4_t v513 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v504, 15), (int32x2_t){0, 0})); + int16x4_t v519 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v503, 15), (int32x2_t){0, 0})); + int16x4_t v534 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v525, 15), (int32x2_t){0, 0})); + int16x4_t v540 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v524, 15), (int32x2_t){0, 0})); + int16x4_t v555 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v546, 15), (int32x2_t){0, 0})); + int16x4_t v561 
= + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v545, 15), (int32x2_t){0, 0})); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v450), 0); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v456), 0); + v6[ostride * 16] = vget_lane_s32(vreinterpret_s32_s16(v471), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v477), 0); + v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v492), 0); + v6[ostride * 17] = vget_lane_s32(vreinterpret_s32_s16(v498), 0); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v513), 0); + v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v519), 0); + v6[ostride * 19] = vget_lane_s32(vreinterpret_s32_s16(v534), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v540), 0); + v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v555), 0); + v6[ostride * 20] = vget_lane_s32(vreinterpret_s32_s16(v561), 0); + v5 += 1 * 1; + v6 += 1 * 1; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uu21(const armral_cmplx_int16_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * 1; + int64_t v12 = v10 * 1; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + float v230 = -1.1666666666666665e+00F; + float v235 = 7.9015646852540022e-01F; + float v240 = 5.5854267289647742e-02F; + float v245 = 7.3430220123575241e-01F; + float v250 = -4.4095855184409838e-01F; + float v257 = -3.4087293062393137e-01F; + float v264 = 5.3396936033772524e-01F; + float v271 = -8.7484229096165667e-01F; + float v314 = -1.4999999999999998e+00F; + float v319 = 1.7499999999999996e+00F; + float v324 = -1.1852347027881001e+00F; + float v329 = -8.3781400934471603e-02F; + float v334 = -1.1014533018536286e+00F; + float v339 = 6.6143782776614746e-01F; + float v346 = 5.1130939593589697e-01F; + float v353 = -8.0095404050658769e-01F; + float v360 = 1.3122634364424848e+00F; + float v403 = -8.6602540378443871e-01F; + float v410 = 1.0103629710818451e+00F; + float v417 = -6.8429557470759583e-01F; + float v424 = -4.8371214382601155e-02F; + float v431 = -6.3592436032499466e-01F; + float v438 = -3.8188130791298663e-01F; + float v443 = -2.9520461738277515e-01F; + float v448 = 4.6243103089499693e-01F; + float v453 = -7.5763564827777208e-01F; + const int32_t *v807 = &v5[v0]; + int32_t *v925 = &v6[v2]; + int64_t v19 = v0 * 7; + int64_t v27 = v0 * 14; + int64_t v46 = v0 * 10; + int64_t v54 = v0 * 17; + int64_t v64 = v0 * 3; + int64_t v73 = v0 * 13; + int64_t v81 = v0 * 20; + int64_t v91 = v0 * 6; + int64_t v100 = v0 * 16; + int64_t v108 = v0 * 2; + int64_t v118 = v0 * 9; + int64_t v127 = v0 * 19; + int64_t v135 = v0 * 5; + int64_t v145 = v0 * 12; + int64_t v162 = v0 * 8; + int64_t v172 = v0 * 15; + int64_t v181 = v0 * 4; + int64_t v189 = v0 * 11; + int64_t v199 = v0 * 18; + float v253 = v4 * v250; + float v260 = v4 * v257; + float v267 = v4 * v264; + float v274 = v4 * v271; + float v342 = v4 * v339; + float v349 = v4 * v346; + float v356 = v4 * v353; + float v363 = v4 * v360; + float v406 = v4 * v403; + float v413 = v4 * v410; + float v420 = v4 * v417; + float v427 = v4 * v424; + float v434 = v4 * v431; + int64_t v488 = v2 * 7; + int64_t v496 = v2 * 14; + int64_t v507 = v2 * 15; + int64_t v523 = v2 * 8; + int64_t v534 = v2 * 9; + int64_t v542 = v2 * 16; + int64_t v550 = v2 
* 2; + int64_t v561 = v2 * 3; + int64_t v569 = v2 * 10; + int64_t v577 = v2 * 17; + int64_t v588 = v2 * 18; + int64_t v596 = v2 * 4; + int64_t v604 = v2 * 11; + int64_t v615 = v2 * 12; + int64_t v623 = v2 * 19; + int64_t v631 = v2 * 5; + int64_t v642 = v2 * 6; + int64_t v650 = v2 * 13; + int64_t v658 = v2 * 20; + const int32_t *v690 = &v5[0]; + svfloat32_t v856 = svdup_n_f32(v230); + svfloat32_t v857 = svdup_n_f32(v235); + svfloat32_t v858 = svdup_n_f32(v240); + svfloat32_t v859 = svdup_n_f32(v245); + svfloat32_t v864 = svdup_n_f32(v314); + svfloat32_t v865 = svdup_n_f32(v319); + svfloat32_t v866 = svdup_n_f32(v324); + svfloat32_t v867 = svdup_n_f32(v329); + svfloat32_t v868 = svdup_n_f32(v334); + svfloat32_t v878 = svdup_n_f32(v438); + svfloat32_t v879 = svdup_n_f32(v443); + svfloat32_t v880 = svdup_n_f32(v448); + svfloat32_t v881 = svdup_n_f32(v453); + int32_t *v889 = &v6[0]; + svfloat32_t v160 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v807[0])), + 1.F / (1ULL << 15ULL)); + const int32_t *v671 = &v5[v19]; + const int32_t *v680 = &v5[v27]; + const int32_t *v699 = &v5[v46]; + const int32_t *v708 = &v5[v54]; + const int32_t *v717 = &v5[v64]; + const int32_t *v726 = &v5[v73]; + const int32_t *v735 = &v5[v81]; + const int32_t *v744 = &v5[v91]; + const int32_t *v753 = &v5[v100]; + const int32_t *v762 = &v5[v108]; + const int32_t *v771 = &v5[v118]; + const int32_t *v780 = &v5[v127]; + const int32_t *v789 = &v5[v135]; + const int32_t *v798 = &v5[v145]; + const int32_t *v816 = &v5[v162]; + const int32_t *v825 = &v5[v172]; + const int32_t *v834 = &v5[v181]; + const int32_t *v843 = &v5[v189]; + const int32_t *v852 = &v5[v199]; + svfloat32_t v860 = svdup_n_f32(v253); + svfloat32_t v861 = svdup_n_f32(v260); + svfloat32_t v862 = svdup_n_f32(v267); + svfloat32_t v863 = svdup_n_f32(v274); + svfloat32_t v869 = svdup_n_f32(v342); + svfloat32_t v870 = svdup_n_f32(v349); + svfloat32_t v871 = svdup_n_f32(v356); + svfloat32_t v872 = svdup_n_f32(v363); + svfloat32_t v873 = svdup_n_f32(v406); + svfloat32_t v874 = svdup_n_f32(v413); + svfloat32_t v875 = svdup_n_f32(v420); + svfloat32_t v876 = svdup_n_f32(v427); + svfloat32_t v877 = svdup_n_f32(v434); + int32_t *v898 = &v6[v488]; + int32_t *v907 = &v6[v496]; + int32_t *v916 = &v6[v507]; + int32_t *v934 = &v6[v523]; + int32_t *v943 = &v6[v534]; + int32_t *v952 = &v6[v542]; + int32_t *v961 = &v6[v550]; + int32_t *v970 = &v6[v561]; + int32_t *v979 = &v6[v569]; + int32_t *v988 = &v6[v577]; + int32_t *v997 = &v6[v588]; + int32_t *v1006 = &v6[v596]; + int32_t *v1015 = &v6[v604]; + int32_t *v1024 = &v6[v615]; + int32_t *v1033 = &v6[v623]; + int32_t *v1042 = &v6[v631]; + int32_t *v1051 = &v6[v642]; + int32_t *v1060 = &v6[v650]; + int32_t *v1069 = &v6[v658]; + svfloat32_t v43 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v690[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v25 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v671[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v33 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v680[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v52 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v699[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v60 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v708[0])), + 1.F / 
(1ULL << 15ULL)); + svfloat32_t v70 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v717[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v79 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v726[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v87 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v735[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v97 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v744[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v106 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v753[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v114 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v762[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v124 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v771[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v133 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v780[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v141 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v789[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v151 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v798[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v168 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v816[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v178 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v825[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v187 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v834[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v195 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v843[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v205 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v852[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v34 = svadd_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v35 = svsub_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v61 = svadd_f32_x(svptrue_b32(), v52, v60); + svfloat32_t v62 = svsub_f32_x(svptrue_b32(), v52, v60); + svfloat32_t v88 = svadd_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v89 = svsub_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v115 = svadd_f32_x(svptrue_b32(), v106, v114); + svfloat32_t v116 = svsub_f32_x(svptrue_b32(), v106, v114); + svfloat32_t v142 = svadd_f32_x(svptrue_b32(), v133, v141); + svfloat32_t v143 = svsub_f32_x(svptrue_b32(), v133, v141); + svfloat32_t v169 = svadd_f32_x(svptrue_b32(), v160, v168); + svfloat32_t v170 = svsub_f32_x(svptrue_b32(), v160, v168); + svfloat32_t v196 = svadd_f32_x(svptrue_b32(), v187, v195); + svfloat32_t v197 = svsub_f32_x(svptrue_b32(), v187, v195); + svfloat32_t v44 = svadd_f32_x(svptrue_b32(), v34, v43); + svfloat32_t v71 = svadd_f32_x(svptrue_b32(), v61, v70); + svfloat32_t v98 = svadd_f32_x(svptrue_b32(), v88, v97); + svfloat32_t v125 = svadd_f32_x(svptrue_b32(), v115, v124); + svfloat32_t v152 = svadd_f32_x(svptrue_b32(), v142, v151); + svfloat32_t v179 = 
svadd_f32_x(svptrue_b32(), v169, v178); + svfloat32_t v206 = svadd_f32_x(svptrue_b32(), v196, v205); + svfloat32_t v296 = svadd_f32_x(svptrue_b32(), v61, v196); + svfloat32_t v297 = svsub_f32_x(svptrue_b32(), v61, v196); + svfloat32_t v298 = svadd_f32_x(svptrue_b32(), v142, v115); + svfloat32_t v299 = svsub_f32_x(svptrue_b32(), v142, v115); + svfloat32_t v300 = svadd_f32_x(svptrue_b32(), v88, v169); + svfloat32_t v301 = svsub_f32_x(svptrue_b32(), v88, v169); + svfloat32_t v385 = svadd_f32_x(svptrue_b32(), v62, v197); + svfloat32_t v386 = svsub_f32_x(svptrue_b32(), v62, v197); + svfloat32_t v387 = svadd_f32_x(svptrue_b32(), v143, v116); + svfloat32_t v388 = svsub_f32_x(svptrue_b32(), v143, v116); + svfloat32_t v389 = svadd_f32_x(svptrue_b32(), v89, v170); + svfloat32_t v390 = svsub_f32_x(svptrue_b32(), v89, v170); + svfloat32_t v207 = svadd_f32_x(svptrue_b32(), v71, v206); + svfloat32_t v208 = svsub_f32_x(svptrue_b32(), v71, v206); + svfloat32_t v209 = svadd_f32_x(svptrue_b32(), v152, v125); + svfloat32_t v210 = svsub_f32_x(svptrue_b32(), v152, v125); + svfloat32_t v211 = svadd_f32_x(svptrue_b32(), v98, v179); + svfloat32_t v212 = svsub_f32_x(svptrue_b32(), v98, v179); + svfloat32_t v302 = svadd_f32_x(svptrue_b32(), v296, v298); + svfloat32_t v305 = svsub_f32_x(svptrue_b32(), v296, v298); + svfloat32_t v306 = svsub_f32_x(svptrue_b32(), v298, v300); + svfloat32_t v307 = svsub_f32_x(svptrue_b32(), v300, v296); + svfloat32_t v308 = svadd_f32_x(svptrue_b32(), v297, v299); + svfloat32_t v310 = svsub_f32_x(svptrue_b32(), v297, v299); + svfloat32_t v311 = svsub_f32_x(svptrue_b32(), v299, v301); + svfloat32_t v312 = svsub_f32_x(svptrue_b32(), v301, v297); + svfloat32_t v391 = svadd_f32_x(svptrue_b32(), v385, v387); + svfloat32_t v394 = svsub_f32_x(svptrue_b32(), v385, v387); + svfloat32_t v395 = svsub_f32_x(svptrue_b32(), v387, v389); + svfloat32_t v396 = svsub_f32_x(svptrue_b32(), v389, v385); + svfloat32_t v397 = svadd_f32_x(svptrue_b32(), v386, v388); + svfloat32_t v399 = svsub_f32_x(svptrue_b32(), v386, v388); + svfloat32_t v400 = svsub_f32_x(svptrue_b32(), v388, v390); + svfloat32_t v401 = svsub_f32_x(svptrue_b32(), v390, v386); + svfloat32_t v213 = svadd_f32_x(svptrue_b32(), v207, v209); + svfloat32_t v216 = svsub_f32_x(svptrue_b32(), v207, v209); + svfloat32_t v217 = svsub_f32_x(svptrue_b32(), v209, v211); + svfloat32_t v218 = svsub_f32_x(svptrue_b32(), v211, v207); + svfloat32_t v219 = svadd_f32_x(svptrue_b32(), v208, v210); + svfloat32_t v221 = svsub_f32_x(svptrue_b32(), v208, v210); + svfloat32_t v222 = svsub_f32_x(svptrue_b32(), v210, v212); + svfloat32_t v223 = svsub_f32_x(svptrue_b32(), v212, v208); + svfloat32_t v303 = svadd_f32_x(svptrue_b32(), v302, v300); + svfloat32_t v309 = svadd_f32_x(svptrue_b32(), v308, v301); + svfloat32_t zero351 = svdup_n_f32(0); + svfloat32_t v351 = svcmla_f32_x(pred_full, zero351, v870, v310, 90); + svfloat32_t zero358 = svdup_n_f32(0); + svfloat32_t v358 = svcmla_f32_x(pred_full, zero358, v871, v311, 90); + svfloat32_t zero365 = svdup_n_f32(0); + svfloat32_t v365 = svcmla_f32_x(pred_full, zero365, v872, v312, 90); + svfloat32_t v392 = svadd_f32_x(svptrue_b32(), v391, v389); + svfloat32_t v398 = svadd_f32_x(svptrue_b32(), v397, v390); + svfloat32_t zero422 = svdup_n_f32(0); + svfloat32_t v422 = svcmla_f32_x(pred_full, zero422, v875, v394, 90); + svfloat32_t zero429 = svdup_n_f32(0); + svfloat32_t v429 = svcmla_f32_x(pred_full, zero429, v876, v395, 90); + svfloat32_t zero436 = svdup_n_f32(0); + svfloat32_t v436 = svcmla_f32_x(pred_full, zero436, v877, 
v396, 90); + svfloat32_t v446 = svmul_f32_x(svptrue_b32(), v399, v879); + svfloat32_t v451 = svmul_f32_x(svptrue_b32(), v400, v880); + svfloat32_t v214 = svadd_f32_x(svptrue_b32(), v213, v211); + svfloat32_t v220 = svadd_f32_x(svptrue_b32(), v219, v212); + svfloat32_t zero262 = svdup_n_f32(0); + svfloat32_t v262 = svcmla_f32_x(pred_full, zero262, v861, v221, 90); + svfloat32_t zero269 = svdup_n_f32(0); + svfloat32_t v269 = svcmla_f32_x(pred_full, zero269, v862, v222, 90); + svfloat32_t zero276 = svdup_n_f32(0); + svfloat32_t v276 = svcmla_f32_x(pred_full, zero276, v863, v223, 90); + svfloat32_t v304 = svadd_f32_x(svptrue_b32(), v303, v34); + svfloat32_t v322 = svmul_f32_x(svptrue_b32(), v303, v865); + svfloat32_t zero344 = svdup_n_f32(0); + svfloat32_t v344 = svcmla_f32_x(pred_full, zero344, v869, v309, 90); + svfloat32_t v393 = svadd_f32_x(svptrue_b32(), v392, v35); + svfloat32_t v215 = svadd_f32_x(svptrue_b32(), v214, v44); + svfloat32_t zero255 = svdup_n_f32(0); + svfloat32_t v255 = svcmla_f32_x(pred_full, zero255, v860, v220, 90); + svfloat32_t v373 = svadd_f32_x(svptrue_b32(), v344, v351); + svfloat32_t v375 = svsub_f32_x(svptrue_b32(), v344, v351); + svfloat32_t v377 = svsub_f32_x(svptrue_b32(), v344, v358); + svfloat32_t zero408 = svdup_n_f32(0); + svfloat32_t v408 = svcmla_f32_x(pred_full, zero408, v873, v393, 90); + svfloat32_t v464 = svmla_f32_x(pred_full, v446, v398, v878); + svfloat32_t v466 = svnmls_f32_x(pred_full, v446, v398, v878); + svfloat32_t v468 = svnmls_f32_x(pred_full, v451, v398, v878); + svfloat32_t v277 = svmla_f32_x(pred_full, v215, v214, v856); + svfloat32_t v284 = svadd_f32_x(svptrue_b32(), v255, v262); + svfloat32_t v286 = svsub_f32_x(svptrue_b32(), v255, v262); + svfloat32_t v288 = svsub_f32_x(svptrue_b32(), v255, v269); + svfloat32_t v366 = svmla_f32_x(pred_full, v322, v304, v864); + svfloat32_t v374 = svadd_f32_x(svptrue_b32(), v373, v358); + svfloat32_t v376 = svsub_f32_x(svptrue_b32(), v375, v365); + svfloat32_t v378 = svadd_f32_x(svptrue_b32(), v377, v365); + svfloat32_t v457 = svcmla_f32_x(pred_full, v408, v874, v392, 90); + svfloat32_t v465 = svmla_f32_x(pred_full, v464, v400, v880); + svfloat32_t v467 = svmls_f32_x(pred_full, v466, v401, v881); + svfloat32_t v469 = svmla_f32_x(pred_full, v468, v401, v881); + svfloat32_t v476 = svmla_f32_x(pred_full, v215, v304, v864); + svint16_t v481 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v215, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v278 = svmla_f32_x(pred_full, v277, v216, v857); + svfloat32_t v280 = svmls_f32_x(pred_full, v277, v216, v857); + svfloat32_t v282 = svmls_f32_x(pred_full, v277, v217, v858); + svfloat32_t v285 = svadd_f32_x(svptrue_b32(), v284, v269); + svfloat32_t v287 = svsub_f32_x(svptrue_b32(), v286, v276); + svfloat32_t v289 = svadd_f32_x(svptrue_b32(), v288, v276); + svfloat32_t v367 = svmla_f32_x(pred_full, v366, v305, v866); + svfloat32_t v369 = svmls_f32_x(pred_full, v366, v305, v866); + svfloat32_t v371 = svmls_f32_x(pred_full, v366, v306, v867); + svfloat32_t v458 = svadd_f32_x(svptrue_b32(), v457, v422); + svfloat32_t v460 = svsub_f32_x(svptrue_b32(), v457, v422); + svfloat32_t v462 = svsub_f32_x(svptrue_b32(), v457, v429); + svfloat32_t v477 = svadd_f32_x(svptrue_b32(), v476, v408); + svfloat32_t v478 = svsub_f32_x(svptrue_b32(), v476, v408); + svst1w_u64(pred_full, (unsigned *)(v889), svreinterpret_u64_s16(v481)); + svfloat32_t v279 = 
svmla_f32_x(pred_full, v278, v217, v858); + svfloat32_t v281 = svmls_f32_x(pred_full, v280, v218, v859); + svfloat32_t v283 = svmla_f32_x(pred_full, v282, v218, v859); + svfloat32_t v368 = svmla_f32_x(pred_full, v367, v306, v867); + svfloat32_t v370 = svmls_f32_x(pred_full, v369, v307, v868); + svfloat32_t v372 = svmla_f32_x(pred_full, v371, v307, v868); + svfloat32_t v459 = svadd_f32_x(svptrue_b32(), v458, v429); + svfloat32_t v461 = svsub_f32_x(svptrue_b32(), v460, v436); + svfloat32_t v463 = svadd_f32_x(svptrue_b32(), v462, v436); + svint16_t v489 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v478, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v497 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v477, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v290 = svadd_f32_x(svptrue_b32(), v279, v285); + svfloat32_t v291 = svsub_f32_x(svptrue_b32(), v279, v285); + svfloat32_t v292 = svadd_f32_x(svptrue_b32(), v281, v287); + svfloat32_t v293 = svsub_f32_x(svptrue_b32(), v281, v287); + svfloat32_t v294 = svadd_f32_x(svptrue_b32(), v283, v289); + svfloat32_t v295 = svsub_f32_x(svptrue_b32(), v283, v289); + svfloat32_t v379 = svadd_f32_x(svptrue_b32(), v368, v374); + svfloat32_t v380 = svsub_f32_x(svptrue_b32(), v368, v374); + svfloat32_t v381 = svadd_f32_x(svptrue_b32(), v370, v376); + svfloat32_t v382 = svsub_f32_x(svptrue_b32(), v370, v376); + svfloat32_t v383 = svadd_f32_x(svptrue_b32(), v372, v378); + svfloat32_t v384 = svsub_f32_x(svptrue_b32(), v372, v378); + svfloat32_t v470 = svadd_f32_x(svptrue_b32(), v459, v465); + svfloat32_t v471 = svsub_f32_x(svptrue_b32(), v459, v465); + svfloat32_t v472 = svadd_f32_x(svptrue_b32(), v461, v467); + svfloat32_t v473 = svsub_f32_x(svptrue_b32(), v461, v467); + svfloat32_t v474 = svadd_f32_x(svptrue_b32(), v463, v469); + svfloat32_t v475 = svsub_f32_x(svptrue_b32(), v463, v469); + svst1w_u64(pred_full, (unsigned *)(v898), svreinterpret_u64_s16(v489)); + svst1w_u64(pred_full, (unsigned *)(v907), svreinterpret_u64_s16(v497)); + svfloat32_t v503 = svadd_f32_x(svptrue_b32(), v291, v380); + svint16_t v508 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v291, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v530 = svadd_f32_x(svptrue_b32(), v293, v382); + svint16_t v535 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v293, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v557 = svadd_f32_x(svptrue_b32(), v294, v383); + svint16_t v562 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v294, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v584 = svadd_f32_x(svptrue_b32(), v295, v384); + svint16_t v589 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v295, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v611 = svadd_f32_x(svptrue_b32(), v292, v381); + svint16_t v616 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, 
svmul_n_f32_x(pred_full, v292, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v638 = svadd_f32_x(svptrue_b32(), v290, v379); + svint16_t v643 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v290, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v504 = svadd_f32_x(svptrue_b32(), v503, v471); + svfloat32_t v505 = svsub_f32_x(svptrue_b32(), v503, v471); + svfloat32_t v531 = svadd_f32_x(svptrue_b32(), v530, v473); + svfloat32_t v532 = svsub_f32_x(svptrue_b32(), v530, v473); + svfloat32_t v558 = svadd_f32_x(svptrue_b32(), v557, v474); + svfloat32_t v559 = svsub_f32_x(svptrue_b32(), v557, v474); + svfloat32_t v585 = svadd_f32_x(svptrue_b32(), v584, v475); + svfloat32_t v586 = svsub_f32_x(svptrue_b32(), v584, v475); + svfloat32_t v612 = svadd_f32_x(svptrue_b32(), v611, v472); + svfloat32_t v613 = svsub_f32_x(svptrue_b32(), v611, v472); + svfloat32_t v639 = svadd_f32_x(svptrue_b32(), v638, v470); + svfloat32_t v640 = svsub_f32_x(svptrue_b32(), v638, v470); + svst1w_u64(pred_full, (unsigned *)(v916), svreinterpret_u64_s16(v508)); + svst1w_u64(pred_full, (unsigned *)(v943), svreinterpret_u64_s16(v535)); + svst1w_u64(pred_full, (unsigned *)(v970), svreinterpret_u64_s16(v562)); + svst1w_u64(pred_full, (unsigned *)(v997), svreinterpret_u64_s16(v589)); + svst1w_u64(pred_full, (unsigned *)(v1024), svreinterpret_u64_s16(v616)); + svst1w_u64(pred_full, (unsigned *)(v1051), svreinterpret_u64_s16(v643)); + svint16_t v516 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v505, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v524 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v504, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v543 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v532, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v551 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v531, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v570 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v559, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v578 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v558, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v597 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v586, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v605 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v585, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v624 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, 
svmul_n_f32_x(pred_full, v613, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v632 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v612, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v651 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v640, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v659 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v639, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v925), svreinterpret_u64_s16(v516)); + svst1w_u64(pred_full, (unsigned *)(v934), svreinterpret_u64_s16(v524)); + svst1w_u64(pred_full, (unsigned *)(v952), svreinterpret_u64_s16(v543)); + svst1w_u64(pred_full, (unsigned *)(v961), svreinterpret_u64_s16(v551)); + svst1w_u64(pred_full, (unsigned *)(v979), svreinterpret_u64_s16(v570)); + svst1w_u64(pred_full, (unsigned *)(v988), svreinterpret_u64_s16(v578)); + svst1w_u64(pred_full, (unsigned *)(v1006), svreinterpret_u64_s16(v597)); + svst1w_u64(pred_full, (unsigned *)(v1015), svreinterpret_u64_s16(v605)); + svst1w_u64(pred_full, (unsigned *)(v1033), svreinterpret_u64_s16(v624)); + svst1w_u64(pred_full, (unsigned *)(v1042), svreinterpret_u64_s16(v632)); + svst1w_u64(pred_full, (unsigned *)(v1060), svreinterpret_u64_s16(v651)); + svst1w_u64(pred_full, (unsigned *)(v1069), svreinterpret_u64_s16(v659)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uu22(const armral_cmplx_int16_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + for (int j = 0; j < howmany; j += 1) { + int16x4_t v110 = vld1s_s16(&v5[istride]); + float v410 = 1.1000000000000001e+00F; + float v413 = 3.3166247903554003e-01F; + float v414 = -3.3166247903554003e-01F; + float v421 = 5.1541501300188641e-01F; + float v425 = 9.4125353283118118e-01F; + float v429 = 1.4143537075597825e+00F; + float v433 = 8.5949297361449750e-01F; + float v437 = 4.2314838273285138e-02F; + float v441 = 3.8639279888589606e-01F; + float v445 = 5.1254589567200015e-01F; + float v449 = 1.0702757469471715e+00F; + float v453 = 5.5486073394528512e-01F; + float v456 = 1.2412944743900585e+00F; + float v457 = -1.2412944743900585e+00F; + float v463 = 2.0897833842005756e-01F; + float v464 = -2.0897833842005756e-01F; + float v470 = 3.7415717312460811e-01F; + float v471 = -3.7415717312460811e-01F; + float v477 = 4.9929922194110327e-02F; + float v478 = -4.9929922194110327e-02F; + float v484 = 6.5815896284539266e-01F; + float v485 = -6.5815896284539266e-01F; + float v491 = 6.3306543373877577e-01F; + float v492 = -6.3306543373877577e-01F; + float v498 = 1.0822460581641109e+00F; + float v499 = -1.0822460581641109e+00F; + float v505 = 8.1720737907134022e-01F; + float v506 = -8.1720737907134022e-01F; + float v512 = 4.2408709531871824e-01F; + float v513 = -4.2408709531871824e-01F; + float32x2_t v515 = (float32x2_t){v4, v4}; + int16x4_t v20 = vld1s_s16(&v5[0]); + float32x2_t v111 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v110)), 15); + float32x2_t v411 = (float32x2_t){v410, 
v410}; + float32x2_t v415 = (float32x2_t){v413, v414}; + float32x2_t v422 = (float32x2_t){v421, v421}; + float32x2_t v426 = (float32x2_t){v425, v425}; + float32x2_t v430 = (float32x2_t){v429, v429}; + float32x2_t v434 = (float32x2_t){v433, v433}; + float32x2_t v438 = (float32x2_t){v437, v437}; + float32x2_t v442 = (float32x2_t){v441, v441}; + float32x2_t v446 = (float32x2_t){v445, v445}; + float32x2_t v450 = (float32x2_t){v449, v449}; + float32x2_t v454 = (float32x2_t){v453, v453}; + float32x2_t v458 = (float32x2_t){v456, v457}; + float32x2_t v465 = (float32x2_t){v463, v464}; + float32x2_t v472 = (float32x2_t){v470, v471}; + float32x2_t v479 = (float32x2_t){v477, v478}; + float32x2_t v486 = (float32x2_t){v484, v485}; + float32x2_t v493 = (float32x2_t){v491, v492}; + float32x2_t v500 = (float32x2_t){v498, v499}; + float32x2_t v507 = (float32x2_t){v505, v506}; + float32x2_t v514 = (float32x2_t){v512, v513}; + float32x2_t v21 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v20)), 15); + int16x4_t v26 = vld1s_s16(&v5[istride * 11]); + int16x4_t v34 = vld1s_s16(&v5[istride * 2]); + int16x4_t v40 = vld1s_s16(&v5[istride * 13]); + int16x4_t v48 = vld1s_s16(&v5[istride * 4]); + int16x4_t v54 = vld1s_s16(&v5[istride * 15]); + int16x4_t v62 = vld1s_s16(&v5[istride * 6]); + int16x4_t v68 = vld1s_s16(&v5[istride * 17]); + int16x4_t v76 = vld1s_s16(&v5[istride * 8]); + int16x4_t v82 = vld1s_s16(&v5[istride * 19]); + int16x4_t v90 = vld1s_s16(&v5[istride * 10]); + int16x4_t v96 = vld1s_s16(&v5[istride * 21]); + int16x4_t v104 = vld1s_s16(&v5[istride * 12]); + int16x4_t v118 = vld1s_s16(&v5[istride * 14]); + int16x4_t v124 = vld1s_s16(&v5[istride * 3]); + int16x4_t v132 = vld1s_s16(&v5[istride * 16]); + int16x4_t v138 = vld1s_s16(&v5[istride * 5]); + int16x4_t v146 = vld1s_s16(&v5[istride * 18]); + int16x4_t v152 = vld1s_s16(&v5[istride * 7]); + int16x4_t v160 = vld1s_s16(&v5[istride * 20]); + int16x4_t v166 = vld1s_s16(&v5[istride * 9]); + float32x2_t v417 = vmul_f32(v515, v415); + float32x2_t v460 = vmul_f32(v515, v458); + float32x2_t v467 = vmul_f32(v515, v465); + float32x2_t v474 = vmul_f32(v515, v472); + float32x2_t v481 = vmul_f32(v515, v479); + float32x2_t v488 = vmul_f32(v515, v486); + float32x2_t v495 = vmul_f32(v515, v493); + float32x2_t v502 = vmul_f32(v515, v500); + float32x2_t v509 = vmul_f32(v515, v507); + float32x2_t v516 = vmul_f32(v515, v514); + float32x2_t v27 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v26)), 15); + float32x2_t v35 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v34)), 15); + float32x2_t v41 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v40)), 15); + float32x2_t v49 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v48)), 15); + float32x2_t v55 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v54)), 15); + float32x2_t v63 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v62)), 15); + float32x2_t v69 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v68)), 15); + float32x2_t v77 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v76)), 15); + float32x2_t v83 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v82)), 15); + float32x2_t v91 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v90)), 15); + float32x2_t v97 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v96)), 15); + float32x2_t v105 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v104)), 15); + float32x2_t v119 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v118)), 15); + float32x2_t v125 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v124)), 15); + float32x2_t v133 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v132)), 15); + float32x2_t v139 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v138)), 15); + float32x2_t v147 = 
vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v146)), 15); + float32x2_t v153 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v152)), 15); + float32x2_t v161 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v160)), 15); + float32x2_t v167 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v166)), 15); + float32x2_t v28 = vadd_f32(v21, v27); + float32x2_t v29 = vsub_f32(v21, v27); + float32x2_t v42 = vadd_f32(v35, v41); + float32x2_t v43 = vsub_f32(v35, v41); + float32x2_t v56 = vadd_f32(v49, v55); + float32x2_t v57 = vsub_f32(v49, v55); + float32x2_t v70 = vadd_f32(v63, v69); + float32x2_t v71 = vsub_f32(v63, v69); + float32x2_t v84 = vadd_f32(v77, v83); + float32x2_t v85 = vsub_f32(v77, v83); + float32x2_t v98 = vadd_f32(v91, v97); + float32x2_t v99 = vsub_f32(v91, v97); + float32x2_t v112 = vadd_f32(v105, v111); + float32x2_t v113 = vsub_f32(v105, v111); + float32x2_t v126 = vadd_f32(v119, v125); + float32x2_t v127 = vsub_f32(v119, v125); + float32x2_t v140 = vadd_f32(v133, v139); + float32x2_t v141 = vsub_f32(v133, v139); + float32x2_t v154 = vadd_f32(v147, v153); + float32x2_t v155 = vsub_f32(v147, v153); + float32x2_t v168 = vadd_f32(v161, v167); + float32x2_t v169 = vsub_f32(v161, v167); + float32x2_t v170 = vadd_f32(v42, v168); + float32x2_t v171 = vadd_f32(v56, v154); + float32x2_t v172 = vadd_f32(v70, v140); + float32x2_t v173 = vadd_f32(v84, v126); + float32x2_t v174 = vadd_f32(v98, v112); + float32x2_t v175 = vsub_f32(v42, v168); + float32x2_t v176 = vsub_f32(v56, v154); + float32x2_t v177 = vsub_f32(v70, v140); + float32x2_t v178 = vsub_f32(v84, v126); + float32x2_t v179 = vsub_f32(v98, v112); + float32x2_t v368 = vadd_f32(v43, v169); + float32x2_t v369 = vadd_f32(v57, v155); + float32x2_t v370 = vadd_f32(v71, v141); + float32x2_t v371 = vadd_f32(v85, v127); + float32x2_t v372 = vadd_f32(v99, v113); + float32x2_t v373 = vsub_f32(v43, v169); + float32x2_t v374 = vsub_f32(v57, v155); + float32x2_t v375 = vsub_f32(v71, v141); + float32x2_t v376 = vsub_f32(v85, v127); + float32x2_t v377 = vsub_f32(v99, v113); + float32x2_t v180 = vadd_f32(v170, v171); + float32x2_t v181 = vadd_f32(v172, v174); + float32x2_t v183 = vsub_f32(v176, v177); + float32x2_t v184 = vadd_f32(v175, v179); + float32x2_t v189 = vsub_f32(v171, v173); + float32x2_t v190 = vsub_f32(v170, v173); + float32x2_t v191 = vsub_f32(v171, v170); + float32x2_t v192 = vsub_f32(v174, v173); + float32x2_t v193 = vsub_f32(v172, v173); + float32x2_t v194 = vsub_f32(v174, v172); + float32x2_t v195 = vsub_f32(v171, v174); + float32x2_t v196 = vsub_f32(v170, v172); + float32x2_t v198 = vadd_f32(v176, v178); + float32x2_t v199 = vsub_f32(v175, v178); + float32x2_t v200 = vadd_f32(v175, v176); + float32x2_t v201 = vsub_f32(v178, v179); + float32x2_t v202 = vsub_f32(v177, v178); + float32x2_t v203 = vsub_f32(v177, v179); + float32x2_t v204 = vadd_f32(v176, v179); + float32x2_t v205 = vsub_f32(v175, v177); + float32x2_t v378 = vadd_f32(v368, v369); + float32x2_t v379 = vadd_f32(v370, v372); + float32x2_t v381 = vsub_f32(v374, v375); + float32x2_t v382 = vadd_f32(v373, v377); + float32x2_t v387 = vsub_f32(v369, v371); + float32x2_t v388 = vsub_f32(v368, v371); + float32x2_t v389 = vsub_f32(v369, v368); + float32x2_t v390 = vsub_f32(v372, v371); + float32x2_t v391 = vsub_f32(v370, v371); + float32x2_t v392 = vsub_f32(v372, v370); + float32x2_t v393 = vsub_f32(v369, v372); + float32x2_t v394 = vsub_f32(v368, v370); + float32x2_t v396 = vadd_f32(v374, v376); + float32x2_t v397 = vsub_f32(v373, v376); + float32x2_t v398 = vadd_f32(v373, v374); + float32x2_t v399 = 
vsub_f32(v376, v377); + float32x2_t v400 = vsub_f32(v375, v376); + float32x2_t v401 = vsub_f32(v375, v377); + float32x2_t v402 = vadd_f32(v374, v377); + float32x2_t v403 = vsub_f32(v373, v375); + float32x2_t v182 = vadd_f32(v173, v180); + float32x2_t v187 = vsub_f32(v183, v184); + float32x2_t v197 = vsub_f32(v181, v180); + float32x2_t v206 = vadd_f32(v183, v184); + float32x2_t v225 = vmul_f32(v189, v422); + float32x2_t v229 = vmul_f32(v190, v426); + float32x2_t v233 = vmul_f32(v191, v430); + float32x2_t v237 = vmul_f32(v192, v434); + float32x2_t v241 = vmul_f32(v193, v438); + float32x2_t v245 = vmul_f32(v194, v442); + float32x2_t v249 = vmul_f32(v195, v446); + float32x2_t v253 = vmul_f32(v196, v450); + float32x2_t v263 = vrev64_f32(v198); + float32x2_t v270 = vrev64_f32(v199); + float32x2_t v277 = vrev64_f32(v200); + float32x2_t v284 = vrev64_f32(v201); + float32x2_t v291 = vrev64_f32(v202); + float32x2_t v298 = vrev64_f32(v203); + float32x2_t v305 = vrev64_f32(v204); + float32x2_t v312 = vrev64_f32(v205); + float32x2_t v380 = vadd_f32(v371, v378); + float32x2_t v385 = vsub_f32(v381, v382); + float32x2_t v395 = vsub_f32(v379, v378); + float32x2_t v404 = vadd_f32(v381, v382); + float32x2_t v423 = vmul_f32(v387, v422); + float32x2_t v427 = vmul_f32(v388, v426); + float32x2_t v431 = vmul_f32(v389, v430); + float32x2_t v435 = vmul_f32(v390, v434); + float32x2_t v439 = vmul_f32(v391, v438); + float32x2_t v443 = vmul_f32(v392, v442); + float32x2_t v447 = vmul_f32(v393, v446); + float32x2_t v451 = vmul_f32(v394, v450); + float32x2_t v461 = vrev64_f32(v396); + float32x2_t v468 = vrev64_f32(v397); + float32x2_t v475 = vrev64_f32(v398); + float32x2_t v482 = vrev64_f32(v399); + float32x2_t v489 = vrev64_f32(v400); + float32x2_t v496 = vrev64_f32(v401); + float32x2_t v503 = vrev64_f32(v402); + float32x2_t v510 = vrev64_f32(v403); + float32x2_t v185 = vadd_f32(v182, v181); + float32x2_t v188 = vsub_f32(v187, v178); + float32x2_t v257 = vmul_f32(v197, v454); + float32x2_t v264 = vmul_f32(v263, v460); + float32x2_t v271 = vmul_f32(v270, v467); + float32x2_t v278 = vmul_f32(v277, v474); + float32x2_t v285 = vmul_f32(v284, v481); + float32x2_t v292 = vmul_f32(v291, v488); + float32x2_t v299 = vmul_f32(v298, v495); + float32x2_t v306 = vmul_f32(v305, v502); + float32x2_t v313 = vmul_f32(v312, v509); + float32x2_t v319 = vrev64_f32(v206); + float32x2_t v322 = vadd_f32(v225, v229); + float32x2_t v323 = vadd_f32(v229, v233); + float32x2_t v324 = vsub_f32(v225, v233); + float32x2_t v325 = vadd_f32(v237, v241); + float32x2_t v326 = vadd_f32(v241, v245); + float32x2_t v327 = vsub_f32(v237, v245); + float32x2_t v383 = vadd_f32(v380, v379); + float32x2_t v386 = vsub_f32(v385, v376); + float32x2_t v455 = vmul_f32(v395, v454); + float32x2_t v462 = vmul_f32(v461, v460); + float32x2_t v469 = vmul_f32(v468, v467); + float32x2_t v476 = vmul_f32(v475, v474); + float32x2_t v483 = vmul_f32(v482, v481); + float32x2_t v490 = vmul_f32(v489, v488); + float32x2_t v497 = vmul_f32(v496, v495); + float32x2_t v504 = vmul_f32(v503, v502); + float32x2_t v511 = vmul_f32(v510, v509); + float32x2_t v517 = vrev64_f32(v404); + float32x2_t v520 = vadd_f32(v423, v427); + float32x2_t v521 = vadd_f32(v427, v431); + float32x2_t v522 = vsub_f32(v423, v431); + float32x2_t v523 = vadd_f32(v435, v439); + float32x2_t v524 = vadd_f32(v439, v443); + float32x2_t v525 = vsub_f32(v435, v443); + float32x2_t v186 = vadd_f32(v28, v185); + float32x2_t v214 = vmul_f32(v185, v411); + float32x2_t v220 = vrev64_f32(v188); + float32x2_t v320 = vmul_f32(v319, 
v516); + float32x2_t v328 = vadd_f32(v253, v257); + float32x2_t v329 = vadd_f32(v249, v257); + float32x2_t v330 = vadd_f32(v271, v278); + float32x2_t v331 = vsub_f32(v264, v278); + float32x2_t v332 = vadd_f32(v292, v299); + float32x2_t v333 = vsub_f32(v285, v299); + float32x2_t v384 = vadd_f32(v29, v383); + float32x2_t v412 = vmul_f32(v383, v411); + float32x2_t v418 = vrev64_f32(v386); + float32x2_t v518 = vmul_f32(v517, v516); + float32x2_t v526 = vadd_f32(v451, v455); + float32x2_t v527 = vadd_f32(v447, v455); + float32x2_t v528 = vadd_f32(v469, v476); + float32x2_t v529 = vsub_f32(v462, v476); + float32x2_t v530 = vadd_f32(v490, v497); + float32x2_t v531 = vsub_f32(v483, v497); + float32x2_t v221 = vmul_f32(v220, v417); + float32x2_t v321 = vsub_f32(v186, v214); + float32x2_t v334 = vadd_f32(v313, v320); + float32x2_t v335 = vsub_f32(v306, v320); + float32x2_t v336 = vadd_f32(v326, v328); + float32x2_t v354 = vadd_f32(v330, v331); + float32x2_t v419 = vmul_f32(v418, v417); + float32x2_t v519 = vsub_f32(v384, v412); + float32x2_t v532 = vadd_f32(v511, v518); + float32x2_t v533 = vsub_f32(v504, v518); + float32x2_t v534 = vadd_f32(v524, v526); + float32x2_t v552 = vadd_f32(v528, v529); + int16x4_t v568 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v186, 15), (int32x2_t){0, 0})); + int16x4_t v574 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v384, 15), (int32x2_t){0, 0})); + float32x2_t v337 = vadd_f32(v336, v321); + float32x2_t v338 = vsub_f32(v321, v323); + float32x2_t v340 = vadd_f32(v321, v327); + float32x2_t v342 = vsub_f32(v321, v324); + float32x2_t v344 = vadd_f32(v321, v322); + float32x2_t v346 = vadd_f32(v221, v332); + float32x2_t v348 = vsub_f32(v334, v330); + float32x2_t v350 = vadd_f32(v221, v335); + float32x2_t v352 = vsub_f32(v335, v331); + float32x2_t v355 = vadd_f32(v354, v332); + float32x2_t v535 = vadd_f32(v534, v519); + float32x2_t v536 = vsub_f32(v519, v521); + float32x2_t v538 = vadd_f32(v519, v525); + float32x2_t v540 = vsub_f32(v519, v522); + float32x2_t v542 = vadd_f32(v519, v520); + float32x2_t v544 = vadd_f32(v419, v530); + float32x2_t v546 = vsub_f32(v532, v528); + float32x2_t v548 = vadd_f32(v419, v533); + float32x2_t v550 = vsub_f32(v533, v529); + float32x2_t v553 = vadd_f32(v552, v530); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v568), 0); + v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v574), 0); + float32x2_t v339 = vsub_f32(v338, v328); + float32x2_t v341 = vadd_f32(v340, v329); + float32x2_t v343 = vsub_f32(v342, v329); + float32x2_t v345 = vsub_f32(v344, v325); + float32x2_t v347 = vadd_f32(v346, v334); + float32x2_t v349 = vsub_f32(v348, v221); + float32x2_t v351 = vadd_f32(v350, v333); + float32x2_t v353 = vsub_f32(v352, v221); + float32x2_t v356 = vadd_f32(v355, v333); + float32x2_t v537 = vsub_f32(v536, v526); + float32x2_t v539 = vadd_f32(v538, v527); + float32x2_t v541 = vsub_f32(v540, v527); + float32x2_t v543 = vsub_f32(v542, v523); + float32x2_t v545 = vadd_f32(v544, v532); + float32x2_t v547 = vsub_f32(v546, v419); + float32x2_t v549 = vadd_f32(v548, v531); + float32x2_t v551 = vsub_f32(v550, v419); + float32x2_t v554 = vadd_f32(v553, v531); + float32x2_t v357 = vsub_f32(v356, v221); + float32x2_t v359 = vadd_f32(v337, v347); + float32x2_t v360 = vadd_f32(v339, v349); + float32x2_t v361 = vsub_f32(v341, v351); + float32x2_t v362 = vadd_f32(v343, v353); + float32x2_t v363 = vsub_f32(v343, v353); + float32x2_t v364 = vadd_f32(v341, v351); + float32x2_t v365 = vsub_f32(v339, v349); + float32x2_t v366 = vsub_f32(v337, v347); + float32x2_t v555 = 
vsub_f32(v554, v419); + float32x2_t v557 = vadd_f32(v535, v545); + float32x2_t v558 = vadd_f32(v537, v547); + float32x2_t v559 = vsub_f32(v539, v549); + float32x2_t v560 = vadd_f32(v541, v551); + float32x2_t v561 = vsub_f32(v541, v551); + float32x2_t v562 = vadd_f32(v539, v549); + float32x2_t v563 = vsub_f32(v537, v547); + float32x2_t v564 = vsub_f32(v535, v545); + float32x2_t v358 = vadd_f32(v345, v357); + float32x2_t v367 = vsub_f32(v345, v357); + float32x2_t v556 = vadd_f32(v543, v555); + float32x2_t v565 = vsub_f32(v543, v555); + int16x4_t v592 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v366, 15), (int32x2_t){0, 0})); + int16x4_t v598 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v564, 15), (int32x2_t){0, 0})); + int16x4_t v604 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v365, 15), (int32x2_t){0, 0})); + int16x4_t v610 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v563, 15), (int32x2_t){0, 0})); + int16x4_t v616 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v364, 15), (int32x2_t){0, 0})); + int16x4_t v622 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v562, 15), (int32x2_t){0, 0})); + int16x4_t v628 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v363, 15), (int32x2_t){0, 0})); + int16x4_t v634 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v561, 15), (int32x2_t){0, 0})); + int16x4_t v640 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v362, 15), (int32x2_t){0, 0})); + int16x4_t v646 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v560, 15), (int32x2_t){0, 0})); + int16x4_t v652 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v361, 15), (int32x2_t){0, 0})); + int16x4_t v658 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v559, 15), (int32x2_t){0, 0})); + int16x4_t v664 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v360, 15), (int32x2_t){0, 0})); + int16x4_t v670 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v558, 15), (int32x2_t){0, 0})); + int16x4_t v676 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v359, 15), (int32x2_t){0, 0})); + int16x4_t v682 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v557, 15), (int32x2_t){0, 0})); + int16x4_t v580 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v367, 15), (int32x2_t){0, 0})); + int16x4_t v586 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v565, 15), (int32x2_t){0, 0})); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v592), 0); + v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v598), 0); + v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v604), 0); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v610), 0); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v616), 0); + v6[ostride * 15] = vget_lane_s32(vreinterpret_s32_s16(v622), 0); + v6[ostride * 16] = vget_lane_s32(vreinterpret_s32_s16(v628), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v634), 0); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v640), 0); + v6[ostride * 17] = vget_lane_s32(vreinterpret_s32_s16(v646), 0); + v6[ostride * 18] = vget_lane_s32(vreinterpret_s32_s16(v652), 0); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v658), 0); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v664), 0); + v6[ostride * 19] = vget_lane_s32(vreinterpret_s32_s16(v670), 0); + v6[ostride * 20] = vget_lane_s32(vreinterpret_s32_s16(v676), 0); + v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v682), 0); + int16x4_t v688 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v358, 15), (int32x2_t){0, 0})); + int16x4_t v694 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v556, 15), (int32x2_t){0, 0})); + v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v580), 0); + v6[ostride] = 
vget_lane_s32(vreinterpret_s32_s16(v586), 0); + v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v688), 0); + v6[ostride * 21] = vget_lane_s32(vreinterpret_s32_s16(v694), 0); + v5 += 1 * 1; + v6 += 1 * 1; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uu22(const armral_cmplx_int16_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * 1; + int64_t v12 = v10 * 1; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + float v468 = 1.1000000000000001e+00F; + float v473 = -3.3166247903554003e-01F; + float v480 = 5.1541501300188641e-01F; + float v485 = 9.4125353283118118e-01F; + float v490 = 1.4143537075597825e+00F; + float v495 = 8.5949297361449750e-01F; + float v500 = 4.2314838273285138e-02F; + float v505 = 3.8639279888589606e-01F; + float v510 = 5.1254589567200015e-01F; + float v515 = 1.0702757469471715e+00F; + float v520 = 5.5486073394528512e-01F; + float v525 = -1.2412944743900585e+00F; + float v532 = -2.0897833842005756e-01F; + float v539 = -3.7415717312460811e-01F; + float v546 = -4.9929922194110327e-02F; + float v553 = -6.5815896284539266e-01F; + float v560 = -6.3306543373877577e-01F; + float v567 = -1.0822460581641109e+00F; + float v574 = -8.1720737907134022e-01F; + float v581 = -4.2408709531871824e-01F; + const int32_t *v934 = &v5[v0]; + int32_t *v1085 = &v6[v2]; + int64_t v27 = v0 * 11; + int64_t v37 = v0 * 2; + int64_t v45 = v0 * 13; + int64_t v55 = v0 * 4; + int64_t v63 = v0 * 15; + int64_t v73 = v0 * 6; + int64_t v81 = v0 * 17; + int64_t v91 = v0 * 8; + int64_t v99 = v0 * 19; + int64_t v109 = v0 * 10; + int64_t v117 = v0 * 21; + int64_t v127 = v0 * 12; + int64_t v145 = v0 * 14; + int64_t v153 = v0 * 3; + int64_t v163 = v0 * 16; + int64_t v171 = v0 * 5; + int64_t v181 = v0 * 18; + int64_t v189 = v0 * 7; + int64_t v199 = v0 * 20; + int64_t v207 = v0 * 9; + float v476 = v4 * v473; + float v528 = v4 * v525; + float v535 = v4 * v532; + float v542 = v4 * v539; + float v549 = v4 * v546; + float v556 = v4 * v553; + float v563 = v4 * v560; + float v570 = v4 * v567; + float v577 = v4 * v574; + float v584 = v4 * v581; + int64_t v643 = v2 * 11; + int64_t v651 = v2 * 12; + int64_t v667 = v2 * 2; + int64_t v675 = v2 * 13; + int64_t v683 = v2 * 14; + int64_t v691 = v2 * 3; + int64_t v699 = v2 * 4; + int64_t v707 = v2 * 15; + int64_t v715 = v2 * 16; + int64_t v723 = v2 * 5; + int64_t v731 = v2 * 6; + int64_t v739 = v2 * 17; + int64_t v747 = v2 * 18; + int64_t v755 = v2 * 7; + int64_t v763 = v2 * 8; + int64_t v771 = v2 * 19; + int64_t v779 = v2 * 20; + int64_t v787 = v2 * 9; + int64_t v795 = v2 * 10; + int64_t v803 = v2 * 21; + const int32_t *v817 = &v5[0]; + svfloat32_t v1031 = svdup_n_f32(v468); + svfloat32_t v1033 = svdup_n_f32(v480); + svfloat32_t v1034 = svdup_n_f32(v485); + svfloat32_t v1035 = svdup_n_f32(v490); + svfloat32_t v1036 = svdup_n_f32(v495); + svfloat32_t v1037 = svdup_n_f32(v500); + svfloat32_t v1038 = svdup_n_f32(v505); + svfloat32_t v1039 = svdup_n_f32(v510); + svfloat32_t v1040 = svdup_n_f32(v515); + svfloat32_t v1041 = svdup_n_f32(v520); + int32_t *v1058 = &v6[0]; + svfloat32_t v141 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v934[0])), + 1.F / (1ULL << 15ULL)); + const int32_t *v826 
= &v5[v27]; + const int32_t *v835 = &v5[v37]; + const int32_t *v844 = &v5[v45]; + const int32_t *v853 = &v5[v55]; + const int32_t *v862 = &v5[v63]; + const int32_t *v871 = &v5[v73]; + const int32_t *v880 = &v5[v81]; + const int32_t *v889 = &v5[v91]; + const int32_t *v898 = &v5[v99]; + const int32_t *v907 = &v5[v109]; + const int32_t *v916 = &v5[v117]; + const int32_t *v925 = &v5[v127]; + const int32_t *v943 = &v5[v145]; + const int32_t *v952 = &v5[v153]; + const int32_t *v961 = &v5[v163]; + const int32_t *v970 = &v5[v171]; + const int32_t *v979 = &v5[v181]; + const int32_t *v988 = &v5[v189]; + const int32_t *v997 = &v5[v199]; + const int32_t *v1006 = &v5[v207]; + svfloat32_t v1032 = svdup_n_f32(v476); + svfloat32_t v1042 = svdup_n_f32(v528); + svfloat32_t v1043 = svdup_n_f32(v535); + svfloat32_t v1044 = svdup_n_f32(v542); + svfloat32_t v1045 = svdup_n_f32(v549); + svfloat32_t v1046 = svdup_n_f32(v556); + svfloat32_t v1047 = svdup_n_f32(v563); + svfloat32_t v1048 = svdup_n_f32(v570); + svfloat32_t v1049 = svdup_n_f32(v577); + svfloat32_t v1050 = svdup_n_f32(v584); + int32_t *v1067 = &v6[v643]; + int32_t *v1076 = &v6[v651]; + int32_t *v1094 = &v6[v667]; + int32_t *v1103 = &v6[v675]; + int32_t *v1112 = &v6[v683]; + int32_t *v1121 = &v6[v691]; + int32_t *v1130 = &v6[v699]; + int32_t *v1139 = &v6[v707]; + int32_t *v1148 = &v6[v715]; + int32_t *v1157 = &v6[v723]; + int32_t *v1166 = &v6[v731]; + int32_t *v1175 = &v6[v739]; + int32_t *v1184 = &v6[v747]; + int32_t *v1193 = &v6[v755]; + int32_t *v1202 = &v6[v763]; + int32_t *v1211 = &v6[v771]; + int32_t *v1220 = &v6[v779]; + int32_t *v1229 = &v6[v787]; + int32_t *v1238 = &v6[v795]; + int32_t *v1247 = &v6[v803]; + svfloat32_t v25 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v817[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v33 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v826[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v43 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v835[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v51 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v844[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v61 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v853[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v69 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v862[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v79 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v871[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v87 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v880[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v97 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v889[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v105 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v898[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v115 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v907[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v123 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v916[0])), + 1.F / (1ULL 
<< 15ULL)); + svfloat32_t v133 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v925[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v151 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v943[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v159 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v952[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v169 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v961[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v177 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v970[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v187 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v979[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v195 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v988[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v205 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v997[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v213 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1006[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v34 = svadd_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v35 = svsub_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v52 = svadd_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v53 = svsub_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v70 = svadd_f32_x(svptrue_b32(), v61, v69); + svfloat32_t v71 = svsub_f32_x(svptrue_b32(), v61, v69); + svfloat32_t v88 = svadd_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v89 = svsub_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v106 = svadd_f32_x(svptrue_b32(), v97, v105); + svfloat32_t v107 = svsub_f32_x(svptrue_b32(), v97, v105); + svfloat32_t v124 = svadd_f32_x(svptrue_b32(), v115, v123); + svfloat32_t v125 = svsub_f32_x(svptrue_b32(), v115, v123); + svfloat32_t v142 = svadd_f32_x(svptrue_b32(), v133, v141); + svfloat32_t v143 = svsub_f32_x(svptrue_b32(), v133, v141); + svfloat32_t v160 = svadd_f32_x(svptrue_b32(), v151, v159); + svfloat32_t v161 = svsub_f32_x(svptrue_b32(), v151, v159); + svfloat32_t v178 = svadd_f32_x(svptrue_b32(), v169, v177); + svfloat32_t v179 = svsub_f32_x(svptrue_b32(), v169, v177); + svfloat32_t v196 = svadd_f32_x(svptrue_b32(), v187, v195); + svfloat32_t v197 = svsub_f32_x(svptrue_b32(), v187, v195); + svfloat32_t v214 = svadd_f32_x(svptrue_b32(), v205, v213); + svfloat32_t v215 = svsub_f32_x(svptrue_b32(), v205, v213); + svfloat32_t v216 = svadd_f32_x(svptrue_b32(), v52, v214); + svfloat32_t v217 = svadd_f32_x(svptrue_b32(), v70, v196); + svfloat32_t v218 = svadd_f32_x(svptrue_b32(), v88, v178); + svfloat32_t v219 = svadd_f32_x(svptrue_b32(), v106, v160); + svfloat32_t v220 = svadd_f32_x(svptrue_b32(), v124, v142); + svfloat32_t v221 = svsub_f32_x(svptrue_b32(), v52, v214); + svfloat32_t v222 = svsub_f32_x(svptrue_b32(), v70, v196); + svfloat32_t v223 = svsub_f32_x(svptrue_b32(), v88, v178); + svfloat32_t v224 = svsub_f32_x(svptrue_b32(), v106, v160); + svfloat32_t v225 = svsub_f32_x(svptrue_b32(), v124, v142); + svfloat32_t v425 = svadd_f32_x(svptrue_b32(), v53, v215); + svfloat32_t v426 = svadd_f32_x(svptrue_b32(), v71, v197); + svfloat32_t v427 = svadd_f32_x(svptrue_b32(), v89, v179); + svfloat32_t v428 
= svadd_f32_x(svptrue_b32(), v107, v161); + svfloat32_t v429 = svadd_f32_x(svptrue_b32(), v125, v143); + svfloat32_t v430 = svsub_f32_x(svptrue_b32(), v53, v215); + svfloat32_t v431 = svsub_f32_x(svptrue_b32(), v71, v197); + svfloat32_t v432 = svsub_f32_x(svptrue_b32(), v89, v179); + svfloat32_t v433 = svsub_f32_x(svptrue_b32(), v107, v161); + svfloat32_t v434 = svsub_f32_x(svptrue_b32(), v125, v143); + svfloat32_t v226 = svadd_f32_x(svptrue_b32(), v216, v217); + svfloat32_t v227 = svadd_f32_x(svptrue_b32(), v218, v220); + svfloat32_t v229 = svsub_f32_x(svptrue_b32(), v222, v223); + svfloat32_t v230 = svadd_f32_x(svptrue_b32(), v221, v225); + svfloat32_t v235 = svsub_f32_x(svptrue_b32(), v217, v219); + svfloat32_t v236 = svsub_f32_x(svptrue_b32(), v216, v219); + svfloat32_t v237 = svsub_f32_x(svptrue_b32(), v217, v216); + svfloat32_t v238 = svsub_f32_x(svptrue_b32(), v220, v219); + svfloat32_t v239 = svsub_f32_x(svptrue_b32(), v218, v219); + svfloat32_t v240 = svsub_f32_x(svptrue_b32(), v220, v218); + svfloat32_t v241 = svsub_f32_x(svptrue_b32(), v217, v220); + svfloat32_t v242 = svsub_f32_x(svptrue_b32(), v216, v218); + svfloat32_t v244 = svadd_f32_x(svptrue_b32(), v222, v224); + svfloat32_t v245 = svsub_f32_x(svptrue_b32(), v221, v224); + svfloat32_t v246 = svadd_f32_x(svptrue_b32(), v221, v222); + svfloat32_t v247 = svsub_f32_x(svptrue_b32(), v224, v225); + svfloat32_t v248 = svsub_f32_x(svptrue_b32(), v223, v224); + svfloat32_t v249 = svsub_f32_x(svptrue_b32(), v223, v225); + svfloat32_t v250 = svadd_f32_x(svptrue_b32(), v222, v225); + svfloat32_t v251 = svsub_f32_x(svptrue_b32(), v221, v223); + svfloat32_t v435 = svadd_f32_x(svptrue_b32(), v425, v426); + svfloat32_t v436 = svadd_f32_x(svptrue_b32(), v427, v429); + svfloat32_t v438 = svsub_f32_x(svptrue_b32(), v431, v432); + svfloat32_t v439 = svadd_f32_x(svptrue_b32(), v430, v434); + svfloat32_t v444 = svsub_f32_x(svptrue_b32(), v426, v428); + svfloat32_t v445 = svsub_f32_x(svptrue_b32(), v425, v428); + svfloat32_t v446 = svsub_f32_x(svptrue_b32(), v426, v425); + svfloat32_t v447 = svsub_f32_x(svptrue_b32(), v429, v428); + svfloat32_t v448 = svsub_f32_x(svptrue_b32(), v427, v428); + svfloat32_t v449 = svsub_f32_x(svptrue_b32(), v429, v427); + svfloat32_t v450 = svsub_f32_x(svptrue_b32(), v426, v429); + svfloat32_t v451 = svsub_f32_x(svptrue_b32(), v425, v427); + svfloat32_t v453 = svadd_f32_x(svptrue_b32(), v431, v433); + svfloat32_t v454 = svsub_f32_x(svptrue_b32(), v430, v433); + svfloat32_t v455 = svadd_f32_x(svptrue_b32(), v430, v431); + svfloat32_t v456 = svsub_f32_x(svptrue_b32(), v433, v434); + svfloat32_t v457 = svsub_f32_x(svptrue_b32(), v432, v433); + svfloat32_t v458 = svsub_f32_x(svptrue_b32(), v432, v434); + svfloat32_t v459 = svadd_f32_x(svptrue_b32(), v431, v434); + svfloat32_t v460 = svsub_f32_x(svptrue_b32(), v430, v432); + svfloat32_t v228 = svadd_f32_x(svptrue_b32(), v219, v226); + svfloat32_t v233 = svsub_f32_x(svptrue_b32(), v229, v230); + svfloat32_t v243 = svsub_f32_x(svptrue_b32(), v227, v226); + svfloat32_t v252 = svadd_f32_x(svptrue_b32(), v229, v230); + svfloat32_t v279 = svmul_f32_x(svptrue_b32(), v236, v1034); + svfloat32_t v284 = svmul_f32_x(svptrue_b32(), v237, v1035); + svfloat32_t v294 = svmul_f32_x(svptrue_b32(), v239, v1037); + svfloat32_t v299 = svmul_f32_x(svptrue_b32(), v240, v1038); + svfloat32_t zero321 = svdup_n_f32(0); + svfloat32_t v321 = svcmla_f32_x(pred_full, zero321, v1042, v244, 90); + svfloat32_t zero335 = svdup_n_f32(0); + svfloat32_t v335 = svcmla_f32_x(pred_full, zero335, v1044, 
v246, 90); + svfloat32_t zero342 = svdup_n_f32(0); + svfloat32_t v342 = svcmla_f32_x(pred_full, zero342, v1045, v247, 90); + svfloat32_t zero356 = svdup_n_f32(0); + svfloat32_t v356 = svcmla_f32_x(pred_full, zero356, v1047, v249, 90); + svfloat32_t zero363 = svdup_n_f32(0); + svfloat32_t v363 = svcmla_f32_x(pred_full, zero363, v1048, v250, 90); + svfloat32_t v437 = svadd_f32_x(svptrue_b32(), v428, v435); + svfloat32_t v442 = svsub_f32_x(svptrue_b32(), v438, v439); + svfloat32_t v452 = svsub_f32_x(svptrue_b32(), v436, v435); + svfloat32_t v461 = svadd_f32_x(svptrue_b32(), v438, v439); + svfloat32_t v488 = svmul_f32_x(svptrue_b32(), v445, v1034); + svfloat32_t v493 = svmul_f32_x(svptrue_b32(), v446, v1035); + svfloat32_t v503 = svmul_f32_x(svptrue_b32(), v448, v1037); + svfloat32_t v508 = svmul_f32_x(svptrue_b32(), v449, v1038); + svfloat32_t zero530 = svdup_n_f32(0); + svfloat32_t v530 = svcmla_f32_x(pred_full, zero530, v1042, v453, 90); + svfloat32_t zero544 = svdup_n_f32(0); + svfloat32_t v544 = svcmla_f32_x(pred_full, zero544, v1044, v455, 90); + svfloat32_t zero551 = svdup_n_f32(0); + svfloat32_t v551 = svcmla_f32_x(pred_full, zero551, v1045, v456, 90); + svfloat32_t zero565 = svdup_n_f32(0); + svfloat32_t v565 = svcmla_f32_x(pred_full, zero565, v1047, v458, 90); + svfloat32_t zero572 = svdup_n_f32(0); + svfloat32_t v572 = svcmla_f32_x(pred_full, zero572, v1048, v459, 90); + svfloat32_t v231 = svadd_f32_x(svptrue_b32(), v228, v227); + svfloat32_t v234 = svsub_f32_x(svptrue_b32(), v233, v224); + svfloat32_t v314 = svmul_f32_x(svptrue_b32(), v243, v1041); + svfloat32_t zero377 = svdup_n_f32(0); + svfloat32_t v377 = svcmla_f32_x(pred_full, zero377, v1050, v252, 90); + svfloat32_t v379 = svmla_f32_x(pred_full, v279, v235, v1033); + svfloat32_t v380 = svmla_f32_x(pred_full, v284, v236, v1034); + svfloat32_t v381 = svnmls_f32_x(pred_full, v284, v235, v1033); + svfloat32_t v382 = svmla_f32_x(pred_full, v294, v238, v1036); + svfloat32_t v383 = svmla_f32_x(pred_full, v299, v239, v1037); + svfloat32_t v384 = svnmls_f32_x(pred_full, v299, v238, v1036); + svfloat32_t v387 = svcmla_f32_x(pred_full, v335, v1043, v245, 90); + svfloat32_t v388 = svsub_f32_x(svptrue_b32(), v321, v335); + svfloat32_t v389 = svcmla_f32_x(pred_full, v356, v1046, v248, 90); + svfloat32_t v390 = svsub_f32_x(svptrue_b32(), v342, v356); + svfloat32_t v440 = svadd_f32_x(svptrue_b32(), v437, v436); + svfloat32_t v443 = svsub_f32_x(svptrue_b32(), v442, v433); + svfloat32_t v523 = svmul_f32_x(svptrue_b32(), v452, v1041); + svfloat32_t zero586 = svdup_n_f32(0); + svfloat32_t v586 = svcmla_f32_x(pred_full, zero586, v1050, v461, 90); + svfloat32_t v588 = svmla_f32_x(pred_full, v488, v444, v1033); + svfloat32_t v589 = svmla_f32_x(pred_full, v493, v445, v1034); + svfloat32_t v590 = svnmls_f32_x(pred_full, v493, v444, v1033); + svfloat32_t v591 = svmla_f32_x(pred_full, v503, v447, v1036); + svfloat32_t v592 = svmla_f32_x(pred_full, v508, v448, v1037); + svfloat32_t v593 = svnmls_f32_x(pred_full, v508, v447, v1036); + svfloat32_t v596 = svcmla_f32_x(pred_full, v544, v1043, v454, 90); + svfloat32_t v597 = svsub_f32_x(svptrue_b32(), v530, v544); + svfloat32_t v598 = svcmla_f32_x(pred_full, v565, v1046, v457, 90); + svfloat32_t v599 = svsub_f32_x(svptrue_b32(), v551, v565); + svfloat32_t v232 = svadd_f32_x(svptrue_b32(), v34, v231); + svfloat32_t zero269 = svdup_n_f32(0); + svfloat32_t v269 = svcmla_f32_x(pred_full, zero269, v1032, v234, 90); + svfloat32_t v385 = svmla_f32_x(pred_full, v314, v242, v1040); + svfloat32_t v386 = 
svmla_f32_x(pred_full, v314, v241, v1039); + svfloat32_t v391 = svcmla_f32_x(pred_full, v377, v1049, v251, 90); + svfloat32_t v392 = svsub_f32_x(svptrue_b32(), v363, v377); + svfloat32_t v411 = svadd_f32_x(svptrue_b32(), v387, v388); + svfloat32_t v441 = svadd_f32_x(svptrue_b32(), v35, v440); + svfloat32_t zero478 = svdup_n_f32(0); + svfloat32_t v478 = svcmla_f32_x(pred_full, zero478, v1032, v443, 90); + svfloat32_t v594 = svmla_f32_x(pred_full, v523, v451, v1040); + svfloat32_t v595 = svmla_f32_x(pred_full, v523, v450, v1039); + svfloat32_t v600 = svcmla_f32_x(pred_full, v586, v1049, v460, 90); + svfloat32_t v601 = svsub_f32_x(svptrue_b32(), v572, v586); + svfloat32_t v620 = svadd_f32_x(svptrue_b32(), v596, v597); + svfloat32_t v378 = svmls_f32_x(pred_full, v232, v231, v1031); + svfloat32_t v393 = svadd_f32_x(svptrue_b32(), v383, v385); + svfloat32_t v403 = svadd_f32_x(svptrue_b32(), v269, v389); + svfloat32_t v405 = svsub_f32_x(svptrue_b32(), v391, v387); + svfloat32_t v407 = svadd_f32_x(svptrue_b32(), v269, v392); + svfloat32_t v409 = svsub_f32_x(svptrue_b32(), v392, v388); + svfloat32_t v412 = svadd_f32_x(svptrue_b32(), v411, v389); + svfloat32_t v587 = svmls_f32_x(pred_full, v441, v440, v1031); + svfloat32_t v602 = svadd_f32_x(svptrue_b32(), v592, v594); + svfloat32_t v612 = svadd_f32_x(svptrue_b32(), v478, v598); + svfloat32_t v614 = svsub_f32_x(svptrue_b32(), v600, v596); + svfloat32_t v616 = svadd_f32_x(svptrue_b32(), v478, v601); + svfloat32_t v618 = svsub_f32_x(svptrue_b32(), v601, v597); + svfloat32_t v621 = svadd_f32_x(svptrue_b32(), v620, v598); + svint16_t v636 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v232, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v644 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v441, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v394 = svadd_f32_x(svptrue_b32(), v393, v378); + svfloat32_t v395 = svsub_f32_x(svptrue_b32(), v378, v380); + svfloat32_t v397 = svadd_f32_x(svptrue_b32(), v378, v384); + svfloat32_t v399 = svsub_f32_x(svptrue_b32(), v378, v381); + svfloat32_t v401 = svadd_f32_x(svptrue_b32(), v378, v379); + svfloat32_t v404 = svadd_f32_x(svptrue_b32(), v403, v391); + svfloat32_t v406 = svsub_f32_x(svptrue_b32(), v405, v269); + svfloat32_t v408 = svadd_f32_x(svptrue_b32(), v407, v390); + svfloat32_t v410 = svsub_f32_x(svptrue_b32(), v409, v269); + svfloat32_t v413 = svadd_f32_x(svptrue_b32(), v412, v390); + svfloat32_t v603 = svadd_f32_x(svptrue_b32(), v602, v587); + svfloat32_t v604 = svsub_f32_x(svptrue_b32(), v587, v589); + svfloat32_t v606 = svadd_f32_x(svptrue_b32(), v587, v593); + svfloat32_t v608 = svsub_f32_x(svptrue_b32(), v587, v590); + svfloat32_t v610 = svadd_f32_x(svptrue_b32(), v587, v588); + svfloat32_t v613 = svadd_f32_x(svptrue_b32(), v612, v600); + svfloat32_t v615 = svsub_f32_x(svptrue_b32(), v614, v478); + svfloat32_t v617 = svadd_f32_x(svptrue_b32(), v616, v599); + svfloat32_t v619 = svsub_f32_x(svptrue_b32(), v618, v478); + svfloat32_t v622 = svadd_f32_x(svptrue_b32(), v621, v599); + svst1w_u64(pred_full, (unsigned *)(v1058), svreinterpret_u64_s16(v636)); + svst1w_u64(pred_full, (unsigned *)(v1067), svreinterpret_u64_s16(v644)); + svfloat32_t v396 = svsub_f32_x(svptrue_b32(), v395, v385); + svfloat32_t v398 = svadd_f32_x(svptrue_b32(), v397, v386); + svfloat32_t 
v400 = svsub_f32_x(svptrue_b32(), v399, v386); + svfloat32_t v402 = svsub_f32_x(svptrue_b32(), v401, v382); + svfloat32_t v414 = svsub_f32_x(svptrue_b32(), v413, v269); + svfloat32_t v416 = svadd_f32_x(svptrue_b32(), v394, v404); + svfloat32_t v423 = svsub_f32_x(svptrue_b32(), v394, v404); + svfloat32_t v605 = svsub_f32_x(svptrue_b32(), v604, v594); + svfloat32_t v607 = svadd_f32_x(svptrue_b32(), v606, v595); + svfloat32_t v609 = svsub_f32_x(svptrue_b32(), v608, v595); + svfloat32_t v611 = svsub_f32_x(svptrue_b32(), v610, v591); + svfloat32_t v623 = svsub_f32_x(svptrue_b32(), v622, v478); + svfloat32_t v625 = svadd_f32_x(svptrue_b32(), v603, v613); + svfloat32_t v632 = svsub_f32_x(svptrue_b32(), v603, v613); + svfloat32_t v415 = svadd_f32_x(svptrue_b32(), v402, v414); + svfloat32_t v417 = svadd_f32_x(svptrue_b32(), v396, v406); + svfloat32_t v418 = svsub_f32_x(svptrue_b32(), v398, v408); + svfloat32_t v419 = svadd_f32_x(svptrue_b32(), v400, v410); + svfloat32_t v420 = svsub_f32_x(svptrue_b32(), v400, v410); + svfloat32_t v421 = svadd_f32_x(svptrue_b32(), v398, v408); + svfloat32_t v422 = svsub_f32_x(svptrue_b32(), v396, v406); + svfloat32_t v424 = svsub_f32_x(svptrue_b32(), v402, v414); + svfloat32_t v624 = svadd_f32_x(svptrue_b32(), v611, v623); + svfloat32_t v626 = svadd_f32_x(svptrue_b32(), v605, v615); + svfloat32_t v627 = svsub_f32_x(svptrue_b32(), v607, v617); + svfloat32_t v628 = svadd_f32_x(svptrue_b32(), v609, v619); + svfloat32_t v629 = svsub_f32_x(svptrue_b32(), v609, v619); + svfloat32_t v630 = svadd_f32_x(svptrue_b32(), v607, v617); + svfloat32_t v631 = svsub_f32_x(svptrue_b32(), v605, v615); + svfloat32_t v633 = svsub_f32_x(svptrue_b32(), v611, v623); + svint16_t v668 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v423, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v676 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v632, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v780 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v416, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v788 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v625, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v652 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v424, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v660 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v633, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v684 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v422, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v692 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v631, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v700 = 
svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v421, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v708 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v630, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v716 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v420, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v724 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v629, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v732 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v419, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v740 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v628, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v748 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v418, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v756 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v627, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v764 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v417, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v772 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v626, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v796 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v415, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v804 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v624, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v1094), svreinterpret_u64_s16(v668)); + svst1w_u64(pred_full, (unsigned *)(v1103), svreinterpret_u64_s16(v676)); + svst1w_u64(pred_full, (unsigned *)(v1220), svreinterpret_u64_s16(v780)); + svst1w_u64(pred_full, (unsigned *)(v1229), svreinterpret_u64_s16(v788)); + svst1w_u64(pred_full, (unsigned *)(v1076), svreinterpret_u64_s16(v652)); + svst1w_u64(pred_full, (unsigned *)(v1085), svreinterpret_u64_s16(v660)); + svst1w_u64(pred_full, (unsigned *)(v1112), svreinterpret_u64_s16(v684)); + svst1w_u64(pred_full, (unsigned *)(v1121), svreinterpret_u64_s16(v692)); + svst1w_u64(pred_full, (unsigned *)(v1130), svreinterpret_u64_s16(v700)); + svst1w_u64(pred_full, (unsigned *)(v1139), svreinterpret_u64_s16(v708)); + svst1w_u64(pred_full, (unsigned *)(v1148), 
svreinterpret_u64_s16(v716)); + svst1w_u64(pred_full, (unsigned *)(v1157), svreinterpret_u64_s16(v724)); + svst1w_u64(pred_full, (unsigned *)(v1166), svreinterpret_u64_s16(v732)); + svst1w_u64(pred_full, (unsigned *)(v1175), svreinterpret_u64_s16(v740)); + svst1w_u64(pred_full, (unsigned *)(v1184), svreinterpret_u64_s16(v748)); + svst1w_u64(pred_full, (unsigned *)(v1193), svreinterpret_u64_s16(v756)); + svst1w_u64(pred_full, (unsigned *)(v1202), svreinterpret_u64_s16(v764)); + svst1w_u64(pred_full, (unsigned *)(v1211), svreinterpret_u64_s16(v772)); + svst1w_u64(pred_full, (unsigned *)(v1238), svreinterpret_u64_s16(v796)); + svst1w_u64(pred_full, (unsigned *)(v1247), svreinterpret_u64_s16(v804)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uu24(const armral_cmplx_int16_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + for (int j = 0; j < howmany; j += 1) { + int16x4_t v89 = vld1s_s16(&v5[istride]); + float v223 = 1.0000000000000000e+00F; + float v224 = -1.0000000000000000e+00F; + float v231 = -7.0710678118654746e-01F; + float v238 = 7.0710678118654757e-01F; + float v290 = -1.4999999999999998e+00F; + float v291 = 1.4999999999999998e+00F; + float v298 = 1.0606601717798210e+00F; + float v305 = -1.0606601717798212e+00F; + float v359 = 8.6602540378443871e-01F; + float v367 = -8.6602540378443871e-01F; + float v374 = 6.1237243569579458e-01F; + float v375 = -6.1237243569579458e-01F; + float32x2_t v377 = (float32x2_t){v4, v4}; + int16x4_t v34 = vld1s_s16(&v5[0]); + float32x2_t v90 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v89)), 15); + float32x2_t v225 = (float32x2_t){v223, v224}; + float32x2_t v232 = (float32x2_t){v238, v231}; + float32x2_t v239 = (float32x2_t){v238, v238}; + float32x2_t v288 = (float32x2_t){v290, v290}; + float32x2_t v292 = (float32x2_t){v290, v291}; + float32x2_t v299 = (float32x2_t){v305, v298}; + float32x2_t v306 = (float32x2_t){v305, v305}; + float32x2_t v361 = (float32x2_t){v359, v367}; + float32x2_t v368 = (float32x2_t){v367, v367}; + float32x2_t v372 = (float32x2_t){v375, v375}; + float32x2_t v376 = (float32x2_t){v374, v375}; + int16x4_t v20 = vld1s_s16(&v5[istride * 8]); + int16x4_t v26 = vld1s_s16(&v5[istride * 16]); + float32x2_t v35 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v34)), 15); + int16x4_t v41 = vld1s_s16(&v5[istride * 11]); + int16x4_t v47 = vld1s_s16(&v5[istride * 19]); + int16x4_t v55 = vld1s_s16(&v5[istride * 3]); + int16x4_t v62 = vld1s_s16(&v5[istride * 14]); + int16x4_t v68 = vld1s_s16(&v5[istride * 22]); + int16x4_t v76 = vld1s_s16(&v5[istride * 6]); + int16x4_t v83 = vld1s_s16(&v5[istride * 17]); + int16x4_t v97 = vld1s_s16(&v5[istride * 9]); + int16x4_t v104 = vld1s_s16(&v5[istride * 20]); + int16x4_t v110 = vld1s_s16(&v5[istride * 4]); + int16x4_t v118 = vld1s_s16(&v5[istride * 12]); + int16x4_t v125 = vld1s_s16(&v5[istride * 23]); + int16x4_t v131 = vld1s_s16(&v5[istride * 7]); + int16x4_t v139 = vld1s_s16(&v5[istride * 15]); + int16x4_t v146 = vld1s_s16(&v5[istride * 2]); + int16x4_t v152 = vld1s_s16(&v5[istride * 10]); + int16x4_t v160 = vld1s_s16(&v5[istride * 18]); + int16x4_t v167 = vld1s_s16(&v5[istride * 5]); + int16x4_t v173 = vld1s_s16(&v5[istride * 13]); + int16x4_t v181 = vld1s_s16(&v5[istride * 21]); + float32x2_t v227 = vmul_f32(v377, v225); + float32x2_t v234 = vmul_f32(v377, v232); + float32x2_t v294 = vmul_f32(v377, v292); + 
float32x2_t v301 = vmul_f32(v377, v299); + float32x2_t v363 = vmul_f32(v377, v361); + float32x2_t v378 = vmul_f32(v377, v376); + float32x2_t v21 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v20)), 15); + float32x2_t v27 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v26)), 15); + float32x2_t v42 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v41)), 15); + float32x2_t v48 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v47)), 15); + float32x2_t v56 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v55)), 15); + float32x2_t v63 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v62)), 15); + float32x2_t v69 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v68)), 15); + float32x2_t v77 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v76)), 15); + float32x2_t v84 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v83)), 15); + float32x2_t v98 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v97)), 15); + float32x2_t v105 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v104)), 15); + float32x2_t v111 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v110)), 15); + float32x2_t v119 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v118)), 15); + float32x2_t v126 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v125)), 15); + float32x2_t v132 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v131)), 15); + float32x2_t v140 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v139)), 15); + float32x2_t v147 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v146)), 15); + float32x2_t v153 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v152)), 15); + float32x2_t v161 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v160)), 15); + float32x2_t v168 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v167)), 15); + float32x2_t v174 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v173)), 15); + float32x2_t v182 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v181)), 15); + float32x2_t v28 = vadd_f32(v21, v27); + float32x2_t v29 = vsub_f32(v21, v27); + float32x2_t v49 = vadd_f32(v42, v48); + float32x2_t v50 = vsub_f32(v42, v48); + float32x2_t v70 = vadd_f32(v63, v69); + float32x2_t v71 = vsub_f32(v63, v69); + float32x2_t v91 = vadd_f32(v84, v90); + float32x2_t v92 = vsub_f32(v84, v90); + float32x2_t v112 = vadd_f32(v105, v111); + float32x2_t v113 = vsub_f32(v105, v111); + float32x2_t v133 = vadd_f32(v126, v132); + float32x2_t v134 = vsub_f32(v126, v132); + float32x2_t v154 = vadd_f32(v147, v153); + float32x2_t v155 = vsub_f32(v147, v153); + float32x2_t v175 = vadd_f32(v168, v174); + float32x2_t v176 = vsub_f32(v168, v174); + float32x2_t v36 = vadd_f32(v28, v35); + float32x2_t v57 = vadd_f32(v49, v56); + float32x2_t v78 = vadd_f32(v70, v77); + float32x2_t v99 = vadd_f32(v91, v98); + float32x2_t v120 = vadd_f32(v112, v119); + float32x2_t v141 = vadd_f32(v133, v140); + float32x2_t v162 = vadd_f32(v154, v161); + float32x2_t v183 = vadd_f32(v175, v182); + float32x2_t v251 = vadd_f32(v28, v112); + float32x2_t v252 = vsub_f32(v28, v112); + float32x2_t v253 = vadd_f32(v70, v154); + float32x2_t v254 = vsub_f32(v70, v154); + float32x2_t v255 = vadd_f32(v49, v133); + float32x2_t v256 = vsub_f32(v49, v133); + float32x2_t v257 = vadd_f32(v91, v175); + float32x2_t v258 = vsub_f32(v91, v175); + float32x2_t v318 = vadd_f32(v29, v113); + float32x2_t v319 = vsub_f32(v29, v113); + float32x2_t v320 = vadd_f32(v71, v155); + float32x2_t v321 = vsub_f32(v71, v155); + float32x2_t v322 = vadd_f32(v50, v134); + float32x2_t v323 = vsub_f32(v50, v134); + float32x2_t v324 = vadd_f32(v92, v176); + float32x2_t v325 = vsub_f32(v92, v176); + float32x2_t v184 = vadd_f32(v36, v120); + float32x2_t v185 = vsub_f32(v36, v120); + float32x2_t v186 = vadd_f32(v78, v162); + float32x2_t v187 = vsub_f32(v78, v162); + float32x2_t v188 = 
vadd_f32(v57, v141); + float32x2_t v189 = vsub_f32(v57, v141); + float32x2_t v190 = vadd_f32(v99, v183); + float32x2_t v191 = vsub_f32(v99, v183); + float32x2_t v259 = vadd_f32(v251, v253); + float32x2_t v260 = vsub_f32(v251, v253); + float32x2_t v261 = vadd_f32(v255, v257); + float32x2_t v262 = vsub_f32(v255, v257); + float32x2_t v265 = vadd_f32(v256, v258); + float32x2_t v266 = vsub_f32(v256, v258); + float32x2_t v289 = vmul_f32(v252, v288); + float32x2_t v295 = vrev64_f32(v254); + float32x2_t v326 = vadd_f32(v318, v320); + float32x2_t v327 = vsub_f32(v318, v320); + float32x2_t v328 = vadd_f32(v322, v324); + float32x2_t v329 = vsub_f32(v322, v324); + float32x2_t v332 = vadd_f32(v323, v325); + float32x2_t v333 = vsub_f32(v323, v325); + float32x2_t v364 = vrev64_f32(v319); + float32x2_t v369 = vmul_f32(v321, v368); + float32x2_t v192 = vadd_f32(v184, v186); + float32x2_t v193 = vsub_f32(v184, v186); + float32x2_t v194 = vadd_f32(v188, v190); + float32x2_t v195 = vsub_f32(v188, v190); + float32x2_t v198 = vadd_f32(v189, v191); + float32x2_t v199 = vsub_f32(v189, v191); + float32x2_t v228 = vrev64_f32(v187); + float32x2_t v263 = vadd_f32(v259, v261); + float32x2_t v264 = vsub_f32(v259, v261); + float32x2_t v278 = vmul_f32(v260, v288); + float32x2_t v284 = vrev64_f32(v262); + float32x2_t v296 = vmul_f32(v295, v294); + float32x2_t v302 = vrev64_f32(v265); + float32x2_t v307 = vmul_f32(v266, v306); + float32x2_t v330 = vadd_f32(v326, v328); + float32x2_t v331 = vsub_f32(v326, v328); + float32x2_t v353 = vrev64_f32(v327); + float32x2_t v358 = vmul_f32(v329, v368); + float32x2_t v365 = vmul_f32(v364, v363); + float32x2_t v373 = vmul_f32(v332, v372); + float32x2_t v379 = vrev64_f32(v333); + float32x2_t v196 = vadd_f32(v192, v194); + float32x2_t v197 = vsub_f32(v192, v194); + float32x2_t v217 = vrev64_f32(v195); + float32x2_t v229 = vmul_f32(v228, v227); + float32x2_t v235 = vrev64_f32(v198); + float32x2_t v240 = vmul_f32(v199, v239); + float32x2_t v270 = vmul_f32(v263, v288); + float32x2_t v274 = vmul_f32(v264, v288); + float32x2_t v285 = vmul_f32(v284, v294); + float32x2_t v303 = vmul_f32(v302, v301); + float32x2_t v310 = vadd_f32(v289, v307); + float32x2_t v311 = vsub_f32(v289, v307); + float32x2_t v339 = vrev64_f32(v330); + float32x2_t v346 = vrev64_f32(v331); + float32x2_t v354 = vmul_f32(v353, v363); + float32x2_t v380 = vmul_f32(v379, v378); + float32x2_t v385 = vadd_f32(v369, v373); + float32x2_t v386 = vsub_f32(v369, v373); + float32x2_t v218 = vmul_f32(v217, v227); + float32x2_t v236 = vmul_f32(v235, v234); + float32x2_t v243 = vadd_f32(v185, v240); + float32x2_t v244 = vsub_f32(v185, v240); + float32x2_t v308 = vadd_f32(v278, v285); + float32x2_t v309 = vsub_f32(v278, v285); + float32x2_t v312 = vadd_f32(v296, v303); + float32x2_t v313 = vsub_f32(v296, v303); + float32x2_t v340 = vmul_f32(v339, v363); + float32x2_t v347 = vmul_f32(v346, v363); + float32x2_t v381 = vadd_f32(v354, v358); + float32x2_t v382 = vsub_f32(v354, v358); + float32x2_t v383 = vadd_f32(v365, v380); + float32x2_t v384 = vsub_f32(v365, v380); + float32x2_t v391 = vadd_f32(v196, v270); + int16x4_t v396 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v196, 15), (int32x2_t){0, 0})); + float32x2_t v475 = vadd_f32(v197, v274); + int16x4_t v480 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v197, 15), (int32x2_t){0, 0})); + float32x2_t v241 = vadd_f32(v193, v218); + float32x2_t v242 = vsub_f32(v193, v218); + float32x2_t v245 = vadd_f32(v229, v236); + float32x2_t v246 = vsub_f32(v229, v236); + float32x2_t v314 = vadd_f32(v310, v312); 
+ float32x2_t v315 = vsub_f32(v310, v312); + float32x2_t v316 = vadd_f32(v311, v313); + float32x2_t v317 = vsub_f32(v311, v313); + float32x2_t v387 = vadd_f32(v383, v385); + float32x2_t v388 = vsub_f32(v383, v385); + float32x2_t v389 = vadd_f32(v384, v386); + float32x2_t v390 = vsub_f32(v384, v386); + float32x2_t v392 = vadd_f32(v391, v340); + float32x2_t v393 = vsub_f32(v391, v340); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v396), 0); + float32x2_t v476 = vadd_f32(v475, v347); + float32x2_t v477 = vsub_f32(v475, v347); + v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v480), 0); + float32x2_t v247 = vadd_f32(v243, v245); + float32x2_t v248 = vsub_f32(v243, v245); + float32x2_t v249 = vadd_f32(v244, v246); + float32x2_t v250 = vsub_f32(v244, v246); + int16x4_t v402 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v393, 15), (int32x2_t){0, 0})); + int16x4_t v408 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v392, 15), (int32x2_t){0, 0})); + float32x2_t v433 = vadd_f32(v242, v309); + int16x4_t v438 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v242, 15), (int32x2_t){0, 0})); + int16x4_t v486 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v477, 15), (int32x2_t){0, 0})); + int16x4_t v492 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v476, 15), (int32x2_t){0, 0})); + float32x2_t v517 = vadd_f32(v241, v308); + int16x4_t v522 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v241, 15), (int32x2_t){0, 0})); + v6[ostride * 16] = vget_lane_s32(vreinterpret_s32_s16(v402), 0); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v408), 0); + float32x2_t v412 = vadd_f32(v248, v315); + int16x4_t v417 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v248, 15), (int32x2_t){0, 0})); + float32x2_t v434 = vadd_f32(v433, v382); + float32x2_t v435 = vsub_f32(v433, v382); + v6[ostride * 18] = vget_lane_s32(vreinterpret_s32_s16(v438), 0); + float32x2_t v454 = vadd_f32(v249, v316); + int16x4_t v459 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v249, 15), (int32x2_t){0, 0})); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v486), 0); + v6[ostride * 20] = vget_lane_s32(vreinterpret_s32_s16(v492), 0); + float32x2_t v496 = vadd_f32(v250, v317); + int16x4_t v501 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v250, 15), (int32x2_t){0, 0})); + float32x2_t v518 = vadd_f32(v517, v381); + float32x2_t v519 = vsub_f32(v517, v381); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v522), 0); + float32x2_t v538 = vadd_f32(v247, v314); + int16x4_t v543 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v247, 15), (int32x2_t){0, 0})); + float32x2_t v413 = vadd_f32(v412, v388); + float32x2_t v414 = vsub_f32(v412, v388); + v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v417), 0); + int16x4_t v444 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v435, 15), (int32x2_t){0, 0})); + int16x4_t v450 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v434, 15), (int32x2_t){0, 0})); + float32x2_t v455 = vadd_f32(v454, v389); + float32x2_t v456 = vsub_f32(v454, v389); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v459), 0); + float32x2_t v497 = vadd_f32(v496, v390); + float32x2_t v498 = vsub_f32(v496, v390); + v6[ostride * 21] = vget_lane_s32(vreinterpret_s32_s16(v501), 0); + int16x4_t v528 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v519, 15), (int32x2_t){0, 0})); + int16x4_t v534 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v518, 15), (int32x2_t){0, 0})); + float32x2_t v539 = vadd_f32(v538, v387); + float32x2_t v540 = vsub_f32(v538, v387); + v6[ostride * 15] = vget_lane_s32(vreinterpret_s32_s16(v543), 0); + int16x4_t v423 = + 
vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v414, 15), (int32x2_t){0, 0})); + int16x4_t v429 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v413, 15), (int32x2_t){0, 0})); + v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v444), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v450), 0); + int16x4_t v465 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v456, 15), (int32x2_t){0, 0})); + int16x4_t v471 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v455, 15), (int32x2_t){0, 0})); + int16x4_t v507 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v498, 15), (int32x2_t){0, 0})); + int16x4_t v513 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v497, 15), (int32x2_t){0, 0})); + v6[ostride * 22] = vget_lane_s32(vreinterpret_s32_s16(v528), 0); + v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v534), 0); + int16x4_t v549 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v540, 15), (int32x2_t){0, 0})); + int16x4_t v555 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v539, 15), (int32x2_t){0, 0})); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v423), 0); + v6[ostride * 17] = vget_lane_s32(vreinterpret_s32_s16(v429), 0); + v6[ostride * 19] = vget_lane_s32(vreinterpret_s32_s16(v465), 0); + v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v471), 0); + v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v507), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v513), 0); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v549), 0); + v6[ostride * 23] = vget_lane_s32(vreinterpret_s32_s16(v555), 0); + v5 += 1 * 1; + v6 += 1 * 1; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uu24(const armral_cmplx_int16_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * 1; + int64_t v12 = v10 * 1; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + float v278 = -1.0000000000000000e+00F; + float v285 = -7.0710678118654746e-01F; + float v292 = 7.0710678118654757e-01F; + float v345 = -1.4999999999999998e+00F; + float v350 = 1.4999999999999998e+00F; + float v357 = 1.0606601717798210e+00F; + float v364 = -1.0606601717798212e+00F; + float v428 = -8.6602540378443871e-01F; + float v438 = -6.1237243569579458e-01F; + const int32_t *v767 = &v5[v0]; + int32_t *v954 = &v6[v2]; + int64_t v19 = v0 * 8; + int64_t v27 = v0 * 16; + int64_t v46 = v0 * 11; + int64_t v54 = v0 * 19; + int64_t v64 = v0 * 3; + int64_t v73 = v0 * 14; + int64_t v81 = v0 * 22; + int64_t v91 = v0 * 6; + int64_t v100 = v0 * 17; + int64_t v118 = v0 * 9; + int64_t v127 = v0 * 20; + int64_t v135 = v0 * 4; + int64_t v145 = v0 * 12; + int64_t v154 = v0 * 23; + int64_t v162 = v0 * 7; + int64_t v172 = v0 * 15; + int64_t v181 = v0 * 2; + int64_t v189 = v0 * 10; + int64_t v199 = v0 * 18; + int64_t v208 = v0 * 5; + int64_t v216 = v0 * 13; + int64_t v226 = v0 * 21; + float v281 = v4 * v278; + float v288 = v4 * v285; + float v353 = v4 * v350; + float v360 = v4 * v357; + float v424 = v4 * v428; + float v441 = v4 * v438; + int64_t v466 = v2 * 16; + int64_t v474 = v2 * 8; + int64_t v485 = v2 * 9; + int64_t v501 = v2 * 17; + int64_t v512 = v2 * 18; + int64_t v520 = v2 * 10; + int64_t v528 = v2 * 2; + int64_t v539 = v2 * 3; + int64_t v547 = v2 * 19; + int64_t v555 = v2 * 11; + int64_t v566 = v2 * 12; + int64_t v574 = v2 * 4; + int64_t v582 = v2 
* 20; + int64_t v593 = v2 * 21; + int64_t v601 = v2 * 13; + int64_t v609 = v2 * 5; + int64_t v620 = v2 * 6; + int64_t v628 = v2 * 22; + int64_t v636 = v2 * 14; + int64_t v647 = v2 * 15; + int64_t v655 = v2 * 7; + int64_t v663 = v2 * 23; + const int32_t *v695 = &v5[0]; + svfloat32_t v894 = svdup_n_f32(v292); + svfloat32_t v899 = svdup_n_f32(v345); + svfloat32_t v902 = svdup_n_f32(v364); + svfloat32_t v908 = svdup_n_f32(v428); + svfloat32_t v909 = svdup_n_f32(v438); + int32_t *v918 = &v6[0]; + svfloat32_t v114 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v767[0])), + 1.F / (1ULL << 15ULL)); + const int32_t *v676 = &v5[v19]; + const int32_t *v685 = &v5[v27]; + const int32_t *v704 = &v5[v46]; + const int32_t *v713 = &v5[v54]; + const int32_t *v722 = &v5[v64]; + const int32_t *v731 = &v5[v73]; + const int32_t *v740 = &v5[v81]; + const int32_t *v749 = &v5[v91]; + const int32_t *v758 = &v5[v100]; + const int32_t *v776 = &v5[v118]; + const int32_t *v785 = &v5[v127]; + const int32_t *v794 = &v5[v135]; + const int32_t *v803 = &v5[v145]; + const int32_t *v812 = &v5[v154]; + const int32_t *v821 = &v5[v162]; + const int32_t *v830 = &v5[v172]; + const int32_t *v839 = &v5[v181]; + const int32_t *v848 = &v5[v189]; + const int32_t *v857 = &v5[v199]; + const int32_t *v866 = &v5[v208]; + const int32_t *v875 = &v5[v216]; + const int32_t *v884 = &v5[v226]; + svfloat32_t v892 = svdup_n_f32(v281); + svfloat32_t v893 = svdup_n_f32(v288); + svfloat32_t v900 = svdup_n_f32(v353); + svfloat32_t v901 = svdup_n_f32(v360); + svfloat32_t v907 = svdup_n_f32(v424); + svfloat32_t v910 = svdup_n_f32(v441); + int32_t *v927 = &v6[v466]; + int32_t *v936 = &v6[v474]; + int32_t *v945 = &v6[v485]; + int32_t *v963 = &v6[v501]; + int32_t *v972 = &v6[v512]; + int32_t *v981 = &v6[v520]; + int32_t *v990 = &v6[v528]; + int32_t *v999 = &v6[v539]; + int32_t *v1008 = &v6[v547]; + int32_t *v1017 = &v6[v555]; + int32_t *v1026 = &v6[v566]; + int32_t *v1035 = &v6[v574]; + int32_t *v1044 = &v6[v582]; + int32_t *v1053 = &v6[v593]; + int32_t *v1062 = &v6[v601]; + int32_t *v1071 = &v6[v609]; + int32_t *v1080 = &v6[v620]; + int32_t *v1089 = &v6[v628]; + int32_t *v1098 = &v6[v636]; + int32_t *v1107 = &v6[v647]; + int32_t *v1116 = &v6[v655]; + int32_t *v1125 = &v6[v663]; + svfloat32_t v43 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v695[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v25 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v676[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v33 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v685[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v52 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v704[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v60 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v713[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v70 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v722[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v79 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v731[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v87 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v740[0])), + 1.F / (1ULL << 
15ULL)); + svfloat32_t v97 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v749[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v106 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v758[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v124 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v776[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v133 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v785[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v141 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v794[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v151 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v803[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v160 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v812[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v168 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v821[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v178 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v830[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v187 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v839[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v195 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v848[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v205 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v857[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v214 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v866[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v222 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v875[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v232 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v884[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v34 = svadd_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v35 = svsub_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v61 = svadd_f32_x(svptrue_b32(), v52, v60); + svfloat32_t v62 = svsub_f32_x(svptrue_b32(), v52, v60); + svfloat32_t v88 = svadd_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v89 = svsub_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v115 = svadd_f32_x(svptrue_b32(), v106, v114); + svfloat32_t v116 = svsub_f32_x(svptrue_b32(), v106, v114); + svfloat32_t v142 = svadd_f32_x(svptrue_b32(), v133, v141); + svfloat32_t v143 = svsub_f32_x(svptrue_b32(), v133, v141); + svfloat32_t v169 = svadd_f32_x(svptrue_b32(), v160, v168); + svfloat32_t v170 = svsub_f32_x(svptrue_b32(), v160, v168); + svfloat32_t v196 = svadd_f32_x(svptrue_b32(), v187, v195); + svfloat32_t v197 = svsub_f32_x(svptrue_b32(), v187, v195); + svfloat32_t v223 = svadd_f32_x(svptrue_b32(), v214, v222); + svfloat32_t v224 = svsub_f32_x(svptrue_b32(), v214, v222); + svfloat32_t v44 = svadd_f32_x(svptrue_b32(), v34, v43); + svfloat32_t v71 = svadd_f32_x(svptrue_b32(), v61, v70); + svfloat32_t v98 = svadd_f32_x(svptrue_b32(), v88, v97); + svfloat32_t v125 = 
svadd_f32_x(svptrue_b32(), v115, v124); + svfloat32_t v152 = svadd_f32_x(svptrue_b32(), v142, v151); + svfloat32_t v179 = svadd_f32_x(svptrue_b32(), v169, v178); + svfloat32_t v206 = svadd_f32_x(svptrue_b32(), v196, v205); + svfloat32_t v233 = svadd_f32_x(svptrue_b32(), v223, v232); + svfloat32_t v306 = svadd_f32_x(svptrue_b32(), v34, v142); + svfloat32_t v307 = svsub_f32_x(svptrue_b32(), v34, v142); + svfloat32_t v308 = svadd_f32_x(svptrue_b32(), v88, v196); + svfloat32_t v309 = svsub_f32_x(svptrue_b32(), v88, v196); + svfloat32_t v310 = svadd_f32_x(svptrue_b32(), v61, v169); + svfloat32_t v311 = svsub_f32_x(svptrue_b32(), v61, v169); + svfloat32_t v312 = svadd_f32_x(svptrue_b32(), v115, v223); + svfloat32_t v313 = svsub_f32_x(svptrue_b32(), v115, v223); + svfloat32_t v378 = svadd_f32_x(svptrue_b32(), v35, v143); + svfloat32_t v379 = svsub_f32_x(svptrue_b32(), v35, v143); + svfloat32_t v380 = svadd_f32_x(svptrue_b32(), v89, v197); + svfloat32_t v381 = svsub_f32_x(svptrue_b32(), v89, v197); + svfloat32_t v382 = svadd_f32_x(svptrue_b32(), v62, v170); + svfloat32_t v383 = svsub_f32_x(svptrue_b32(), v62, v170); + svfloat32_t v384 = svadd_f32_x(svptrue_b32(), v116, v224); + svfloat32_t v385 = svsub_f32_x(svptrue_b32(), v116, v224); + svfloat32_t v234 = svadd_f32_x(svptrue_b32(), v44, v152); + svfloat32_t v235 = svsub_f32_x(svptrue_b32(), v44, v152); + svfloat32_t v236 = svadd_f32_x(svptrue_b32(), v98, v206); + svfloat32_t v237 = svsub_f32_x(svptrue_b32(), v98, v206); + svfloat32_t v238 = svadd_f32_x(svptrue_b32(), v71, v179); + svfloat32_t v239 = svsub_f32_x(svptrue_b32(), v71, v179); + svfloat32_t v240 = svadd_f32_x(svptrue_b32(), v125, v233); + svfloat32_t v241 = svsub_f32_x(svptrue_b32(), v125, v233); + svfloat32_t v314 = svadd_f32_x(svptrue_b32(), v306, v308); + svfloat32_t v315 = svsub_f32_x(svptrue_b32(), v306, v308); + svfloat32_t v316 = svadd_f32_x(svptrue_b32(), v310, v312); + svfloat32_t v317 = svsub_f32_x(svptrue_b32(), v310, v312); + svfloat32_t v320 = svadd_f32_x(svptrue_b32(), v311, v313); + svfloat32_t v321 = svsub_f32_x(svptrue_b32(), v311, v313); + svfloat32_t zero355 = svdup_n_f32(0); + svfloat32_t v355 = svcmla_f32_x(pred_full, zero355, v900, v309, 90); + svfloat32_t v386 = svadd_f32_x(svptrue_b32(), v378, v380); + svfloat32_t v387 = svsub_f32_x(svptrue_b32(), v378, v380); + svfloat32_t v388 = svadd_f32_x(svptrue_b32(), v382, v384); + svfloat32_t v389 = svsub_f32_x(svptrue_b32(), v382, v384); + svfloat32_t v392 = svadd_f32_x(svptrue_b32(), v383, v385); + svfloat32_t v393 = svsub_f32_x(svptrue_b32(), v383, v385); + svfloat32_t zero426 = svdup_n_f32(0); + svfloat32_t v426 = svcmla_f32_x(pred_full, zero426, v907, v379, 90); + svfloat32_t v242 = svadd_f32_x(svptrue_b32(), v234, v236); + svfloat32_t v243 = svsub_f32_x(svptrue_b32(), v234, v236); + svfloat32_t v244 = svadd_f32_x(svptrue_b32(), v238, v240); + svfloat32_t v245 = svsub_f32_x(svptrue_b32(), v238, v240); + svfloat32_t v248 = svadd_f32_x(svptrue_b32(), v239, v241); + svfloat32_t v249 = svsub_f32_x(svptrue_b32(), v239, v241); + svfloat32_t zero283 = svdup_n_f32(0); + svfloat32_t v283 = svcmla_f32_x(pred_full, zero283, v892, v237, 90); + svfloat32_t v318 = svadd_f32_x(svptrue_b32(), v314, v316); + svfloat32_t v319 = svsub_f32_x(svptrue_b32(), v314, v316); + svfloat32_t zero343 = svdup_n_f32(0); + svfloat32_t v343 = svcmla_f32_x(pred_full, zero343, v900, v317, 90); + svfloat32_t zero362 = svdup_n_f32(0); + svfloat32_t v362 = svcmla_f32_x(pred_full, zero362, v901, v320, 90); + svfloat32_t v367 = svmul_f32_x(svptrue_b32(), 
v321, v902); + svfloat32_t v390 = svadd_f32_x(svptrue_b32(), v386, v388); + svfloat32_t v391 = svsub_f32_x(svptrue_b32(), v386, v388); + svfloat32_t zero414 = svdup_n_f32(0); + svfloat32_t v414 = svcmla_f32_x(pred_full, zero414, v907, v387, 90); + svfloat32_t v436 = svmul_f32_x(svptrue_b32(), v392, v909); + svfloat32_t zero443 = svdup_n_f32(0); + svfloat32_t v443 = svcmla_f32_x(pred_full, zero443, v910, v393, 90); + svfloat32_t v246 = svadd_f32_x(svptrue_b32(), v242, v244); + svfloat32_t v247 = svsub_f32_x(svptrue_b32(), v242, v244); + svfloat32_t zero271 = svdup_n_f32(0); + svfloat32_t v271 = svcmla_f32_x(pred_full, zero271, v892, v245, 90); + svfloat32_t zero290 = svdup_n_f32(0); + svfloat32_t v290 = svcmla_f32_x(pred_full, zero290, v893, v248, 90); + svfloat32_t v368 = svmla_f32_x(pred_full, v343, v315, v899); + svfloat32_t v369 = svnmls_f32_x(pred_full, v343, v315, v899); + svfloat32_t v370 = svmla_f32_x(pred_full, v367, v307, v899); + svfloat32_t v371 = svnmls_f32_x(pred_full, v367, v307, v899); + svfloat32_t v372 = svadd_f32_x(svptrue_b32(), v355, v362); + svfloat32_t v373 = svsub_f32_x(svptrue_b32(), v355, v362); + svfloat32_t zero400 = svdup_n_f32(0); + svfloat32_t v400 = svcmla_f32_x(pred_full, zero400, v907, v390, 90); + svfloat32_t zero407 = svdup_n_f32(0); + svfloat32_t v407 = svcmla_f32_x(pred_full, zero407, v907, v391, 90); + svfloat32_t v444 = svmla_f32_x(pred_full, v414, v389, v908); + svfloat32_t v445 = svmls_f32_x(pred_full, v414, v389, v908); + svfloat32_t v446 = svadd_f32_x(svptrue_b32(), v426, v443); + svfloat32_t v447 = svsub_f32_x(svptrue_b32(), v426, v443); + svfloat32_t v448 = svmla_f32_x(pred_full, v436, v381, v908); + svfloat32_t v449 = svnmls_f32_x(pred_full, v436, v381, v908); + svfloat32_t v296 = svadd_f32_x(svptrue_b32(), v243, v271); + svfloat32_t v297 = svsub_f32_x(svptrue_b32(), v243, v271); + svfloat32_t v298 = svmla_f32_x(pred_full, v235, v249, v894); + svfloat32_t v299 = svmls_f32_x(pred_full, v235, v249, v894); + svfloat32_t v300 = svadd_f32_x(svptrue_b32(), v283, v290); + svfloat32_t v301 = svsub_f32_x(svptrue_b32(), v283, v290); + svfloat32_t v374 = svadd_f32_x(svptrue_b32(), v370, v372); + svfloat32_t v375 = svsub_f32_x(svptrue_b32(), v370, v372); + svfloat32_t v376 = svadd_f32_x(svptrue_b32(), v371, v373); + svfloat32_t v377 = svsub_f32_x(svptrue_b32(), v371, v373); + svfloat32_t v450 = svadd_f32_x(svptrue_b32(), v446, v448); + svfloat32_t v451 = svsub_f32_x(svptrue_b32(), v446, v448); + svfloat32_t v452 = svadd_f32_x(svptrue_b32(), v447, v449); + svfloat32_t v453 = svsub_f32_x(svptrue_b32(), v447, v449); + svfloat32_t v454 = svmla_f32_x(pred_full, v246, v318, v899); + svint16_t v459 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v246, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v562 = svmla_f32_x(pred_full, v247, v319, v899); + svint16_t v567 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v247, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v302 = svadd_f32_x(svptrue_b32(), v298, v300); + svfloat32_t v303 = svsub_f32_x(svptrue_b32(), v298, v300); + svfloat32_t v304 = svadd_f32_x(svptrue_b32(), v299, v301); + svfloat32_t v305 = svsub_f32_x(svptrue_b32(), v299, v301); + svfloat32_t v455 = svadd_f32_x(svptrue_b32(), v454, v400); + svfloat32_t v456 = svsub_f32_x(svptrue_b32(), v454, v400); + 
svfloat32_t v508 = svadd_f32_x(svptrue_b32(), v297, v369); + svint16_t v513 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v297, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v563 = svadd_f32_x(svptrue_b32(), v562, v407); + svfloat32_t v564 = svsub_f32_x(svptrue_b32(), v562, v407); + svfloat32_t v616 = svadd_f32_x(svptrue_b32(), v296, v368); + svint16_t v621 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v296, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v918), svreinterpret_u64_s16(v459)); + svst1w_u64(pred_full, (unsigned *)(v1026), svreinterpret_u64_s16(v567)); + svint16_t v467 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v456, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v475 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v455, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v481 = svadd_f32_x(svptrue_b32(), v303, v375); + svint16_t v486 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v303, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v509 = svadd_f32_x(svptrue_b32(), v508, v445); + svfloat32_t v510 = svsub_f32_x(svptrue_b32(), v508, v445); + svfloat32_t v535 = svadd_f32_x(svptrue_b32(), v304, v376); + svint16_t v540 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v304, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v575 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v564, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v583 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v563, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v589 = svadd_f32_x(svptrue_b32(), v305, v377); + svint16_t v594 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v305, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v617 = svadd_f32_x(svptrue_b32(), v616, v444); + svfloat32_t v618 = svsub_f32_x(svptrue_b32(), v616, v444); + svfloat32_t v643 = svadd_f32_x(svptrue_b32(), v302, v374); + svint16_t v648 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v302, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v972), svreinterpret_u64_s16(v513)); + svst1w_u64(pred_full, (unsigned *)(v1080), svreinterpret_u64_s16(v621)); + svfloat32_t v482 = svadd_f32_x(svptrue_b32(), v481, v451); + svfloat32_t v483 = svsub_f32_x(svptrue_b32(), v481, v451); + svint16_t v521 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, 
svmul_n_f32_x(pred_full, v510, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v529 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v509, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v536 = svadd_f32_x(svptrue_b32(), v535, v452); + svfloat32_t v537 = svsub_f32_x(svptrue_b32(), v535, v452); + svfloat32_t v590 = svadd_f32_x(svptrue_b32(), v589, v453); + svfloat32_t v591 = svsub_f32_x(svptrue_b32(), v589, v453); + svint16_t v629 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v618, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v637 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v617, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v644 = svadd_f32_x(svptrue_b32(), v643, v450); + svfloat32_t v645 = svsub_f32_x(svptrue_b32(), v643, v450); + svst1w_u64(pred_full, (unsigned *)(v927), svreinterpret_u64_s16(v467)); + svst1w_u64(pred_full, (unsigned *)(v936), svreinterpret_u64_s16(v475)); + svst1w_u64(pred_full, (unsigned *)(v945), svreinterpret_u64_s16(v486)); + svst1w_u64(pred_full, (unsigned *)(v999), svreinterpret_u64_s16(v540)); + svst1w_u64(pred_full, (unsigned *)(v1035), svreinterpret_u64_s16(v575)); + svst1w_u64(pred_full, (unsigned *)(v1044), svreinterpret_u64_s16(v583)); + svst1w_u64(pred_full, (unsigned *)(v1053), svreinterpret_u64_s16(v594)); + svst1w_u64(pred_full, (unsigned *)(v1107), svreinterpret_u64_s16(v648)); + svint16_t v494 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v483, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v502 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v482, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v548 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v537, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v556 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v536, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v602 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v591, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v610 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v590, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v656 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v645, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v664 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v644, (float)(1ULL << 31ULL)))), + 
svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v981), svreinterpret_u64_s16(v521)); + svst1w_u64(pred_full, (unsigned *)(v990), svreinterpret_u64_s16(v529)); + svst1w_u64(pred_full, (unsigned *)(v1089), svreinterpret_u64_s16(v629)); + svst1w_u64(pred_full, (unsigned *)(v1098), svreinterpret_u64_s16(v637)); + svst1w_u64(pred_full, (unsigned *)(v954), svreinterpret_u64_s16(v494)); + svst1w_u64(pred_full, (unsigned *)(v963), svreinterpret_u64_s16(v502)); + svst1w_u64(pred_full, (unsigned *)(v1008), svreinterpret_u64_s16(v548)); + svst1w_u64(pred_full, (unsigned *)(v1017), svreinterpret_u64_s16(v556)); + svst1w_u64(pred_full, (unsigned *)(v1062), svreinterpret_u64_s16(v602)); + svst1w_u64(pred_full, (unsigned *)(v1071), svreinterpret_u64_s16(v610)); + svst1w_u64(pred_full, (unsigned *)(v1116), svreinterpret_u64_s16(v656)); + svst1w_u64(pred_full, (unsigned *)(v1125), svreinterpret_u64_s16(v664)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uu25(const armral_cmplx_int16_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + for (int j = 0; j < howmany; j += 1) { + int16x4_t v164 = vld1s_s16(&v5[istride]); + float v881 = 9.6858316112863108e-01F; + float v884 = -2.4868988716485479e-01F; + float v885 = 2.4868988716485479e-01F; + float v1025 = 8.7630668004386358e-01F; + float v1028 = -4.8175367410171532e-01F; + float v1029 = 4.8175367410171532e-01F; + float v1169 = 7.2896862742141155e-01F; + float v1172 = -6.8454710592868862e-01F; + float v1173 = 6.8454710592868862e-01F; + float v1181 = 6.2790519529313527e-02F; + float v1184 = -9.9802672842827156e-01F; + float v1185 = 9.9802672842827156e-01F; + float v1313 = 5.3582679497899655e-01F; + float v1316 = -8.4432792550201508e-01F; + float v1317 = 8.4432792550201508e-01F; + float v1325 = -4.2577929156507272e-01F; + float v1328 = -9.0482705246601947e-01F; + float v1329 = 9.0482705246601947e-01F; + float v1337 = -6.3742398974868952e-01F; + float v1340 = 7.7051324277578936e-01F; + float v1341 = -7.7051324277578936e-01F; + float v1355 = -9.9211470131447776e-01F; + float v1358 = -1.2533323356430454e-01F; + float v1359 = 1.2533323356430454e-01F; + float v1375 = 2.5000000000000000e-01F; + float v1385 = 5.5901699437494745e-01F; + float v1395 = 6.1803398874989490e-01F; + float v1420 = 9.5105651629515353e-01F; + float v1421 = -9.5105651629515353e-01F; + float32x2_t v1423 = (float32x2_t){v4, v4}; + float v1446 = 2.0000000000000000e+00F; + int16x4_t v20 = vld1s_s16(&v5[0]); + float32x2_t v165 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v164)), 15); + float32x2_t v882 = (float32x2_t){v881, v881}; + float32x2_t v886 = (float32x2_t){v884, v885}; + float32x2_t v1026 = (float32x2_t){v1025, v1025}; + float32x2_t v1030 = (float32x2_t){v1028, v1029}; + float32x2_t v1170 = (float32x2_t){v1169, v1169}; + float32x2_t v1174 = (float32x2_t){v1172, v1173}; + float32x2_t v1182 = (float32x2_t){v1181, v1181}; + float32x2_t v1186 = (float32x2_t){v1184, v1185}; + float32x2_t v1216 = (float32x2_t){v1341, v1340}; + float32x2_t v1314 = (float32x2_t){v1313, v1313}; + float32x2_t v1318 = (float32x2_t){v1316, v1317}; + float32x2_t v1326 = (float32x2_t){v1325, v1325}; + float32x2_t v1330 = (float32x2_t){v1328, v1329}; + float32x2_t v1338 = (float32x2_t){v1337, v1337}; + float32x2_t v1342 = (float32x2_t){v1340, v1341}; + 
float32x2_t v1356 = (float32x2_t){v1355, v1355}; + float32x2_t v1360 = (float32x2_t){v1358, v1359}; + float32x2_t v1376 = (float32x2_t){v1375, v1375}; + float32x2_t v1386 = (float32x2_t){v1385, v1385}; + float32x2_t v1396 = (float32x2_t){v1395, v1395}; + float32x2_t v1422 = (float32x2_t){v1420, v1421}; + float32x2_t v1447 = (float32x2_t){v1446, v1446}; + float32x2_t v21 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v20)), 15); + int16x4_t v26 = vld1s_s16(&v5[istride * 5]); + int16x4_t v32 = vld1s_s16(&v5[istride * 10]); + int16x4_t v38 = vld1s_s16(&v5[istride * 15]); + int16x4_t v44 = vld1s_s16(&v5[istride * 20]); + int16x4_t v170 = vld1s_s16(&v5[istride * 6]); + int16x4_t v176 = vld1s_s16(&v5[istride * 11]); + int16x4_t v182 = vld1s_s16(&v5[istride * 16]); + int16x4_t v188 = vld1s_s16(&v5[istride * 21]); + int16x4_t v308 = vld1s_s16(&v5[istride * 2]); + int16x4_t v314 = vld1s_s16(&v5[istride * 7]); + int16x4_t v320 = vld1s_s16(&v5[istride * 12]); + int16x4_t v326 = vld1s_s16(&v5[istride * 17]); + int16x4_t v332 = vld1s_s16(&v5[istride * 22]); + int16x4_t v452 = vld1s_s16(&v5[istride * 3]); + int16x4_t v458 = vld1s_s16(&v5[istride * 8]); + int16x4_t v464 = vld1s_s16(&v5[istride * 13]); + int16x4_t v470 = vld1s_s16(&v5[istride * 18]); + int16x4_t v476 = vld1s_s16(&v5[istride * 23]); + int16x4_t v596 = vld1s_s16(&v5[istride * 4]); + int16x4_t v602 = vld1s_s16(&v5[istride * 9]); + int16x4_t v608 = vld1s_s16(&v5[istride * 14]); + int16x4_t v614 = vld1s_s16(&v5[istride * 19]); + int16x4_t v620 = vld1s_s16(&v5[istride * 24]); + float32x2_t v888 = vmul_f32(v1423, v886); + float32x2_t v1032 = vmul_f32(v1423, v1030); + float32x2_t v1176 = vmul_f32(v1423, v1174); + float32x2_t v1188 = vmul_f32(v1423, v1186); + float32x2_t v1218 = vmul_f32(v1423, v1216); + float32x2_t v1320 = vmul_f32(v1423, v1318); + float32x2_t v1332 = vmul_f32(v1423, v1330); + float32x2_t v1344 = vmul_f32(v1423, v1342); + float32x2_t v1362 = vmul_f32(v1423, v1360); + float32x2_t v1424 = vmul_f32(v1423, v1422); + float32x2_t v27 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v26)), 15); + float32x2_t v33 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v32)), 15); + float32x2_t v39 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v38)), 15); + float32x2_t v45 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v44)), 15); + float32x2_t v171 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v170)), 15); + float32x2_t v177 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v176)), 15); + float32x2_t v183 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v182)), 15); + float32x2_t v189 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v188)), 15); + float32x2_t v309 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v308)), 15); + float32x2_t v315 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v314)), 15); + float32x2_t v321 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v320)), 15); + float32x2_t v327 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v326)), 15); + float32x2_t v333 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v332)), 15); + float32x2_t v453 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v452)), 15); + float32x2_t v459 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v458)), 15); + float32x2_t v465 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v464)), 15); + float32x2_t v471 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v470)), 15); + float32x2_t v477 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v476)), 15); + float32x2_t v597 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v596)), 15); + float32x2_t v603 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v602)), 15); + float32x2_t v609 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v608)), 15); + float32x2_t v615 = 
vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v614)), 15); + float32x2_t v621 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v620)), 15); + float32x2_t v82 = vsub_f32(v27, v45); + float32x2_t v86 = vmul_f32(v27, v1447); + float32x2_t v100 = vsub_f32(v33, v39); + float32x2_t v104 = vmul_f32(v33, v1447); + float32x2_t v226 = vsub_f32(v171, v189); + float32x2_t v230 = vmul_f32(v171, v1447); + float32x2_t v244 = vsub_f32(v177, v183); + float32x2_t v248 = vmul_f32(v177, v1447); + float32x2_t v370 = vsub_f32(v315, v333); + float32x2_t v374 = vmul_f32(v315, v1447); + float32x2_t v388 = vsub_f32(v321, v327); + float32x2_t v392 = vmul_f32(v321, v1447); + float32x2_t v514 = vsub_f32(v459, v477); + float32x2_t v518 = vmul_f32(v459, v1447); + float32x2_t v532 = vsub_f32(v465, v471); + float32x2_t v536 = vmul_f32(v465, v1447); + float32x2_t v658 = vsub_f32(v603, v621); + float32x2_t v662 = vmul_f32(v603, v1447); + float32x2_t v676 = vsub_f32(v609, v615); + float32x2_t v680 = vmul_f32(v609, v1447); + float32x2_t v87 = vsub_f32(v86, v82); + float32x2_t v105 = vsub_f32(v104, v100); + float32x2_t v116 = vmul_f32(v100, v1396); + float32x2_t v131 = vmul_f32(v82, v1396); + float32x2_t v231 = vsub_f32(v230, v226); + float32x2_t v249 = vsub_f32(v248, v244); + float32x2_t v260 = vmul_f32(v244, v1396); + float32x2_t v275 = vmul_f32(v226, v1396); + float32x2_t v375 = vsub_f32(v374, v370); + float32x2_t v393 = vsub_f32(v392, v388); + float32x2_t v404 = vmul_f32(v388, v1396); + float32x2_t v419 = vmul_f32(v370, v1396); + float32x2_t v519 = vsub_f32(v518, v514); + float32x2_t v537 = vsub_f32(v536, v532); + float32x2_t v548 = vmul_f32(v532, v1396); + float32x2_t v563 = vmul_f32(v514, v1396); + float32x2_t v663 = vsub_f32(v662, v658); + float32x2_t v681 = vsub_f32(v680, v676); + float32x2_t v692 = vmul_f32(v676, v1396); + float32x2_t v707 = vmul_f32(v658, v1396); + float32x2_t v106 = vadd_f32(v87, v105); + float32x2_t v107 = vsub_f32(v87, v105); + float32x2_t v117 = vadd_f32(v82, v116); + float32x2_t v132 = vsub_f32(v131, v100); + float32x2_t v250 = vadd_f32(v231, v249); + float32x2_t v251 = vsub_f32(v231, v249); + float32x2_t v261 = vadd_f32(v226, v260); + float32x2_t v276 = vsub_f32(v275, v244); + float32x2_t v394 = vadd_f32(v375, v393); + float32x2_t v395 = vsub_f32(v375, v393); + float32x2_t v405 = vadd_f32(v370, v404); + float32x2_t v420 = vsub_f32(v419, v388); + float32x2_t v538 = vadd_f32(v519, v537); + float32x2_t v539 = vsub_f32(v519, v537); + float32x2_t v549 = vadd_f32(v514, v548); + float32x2_t v564 = vsub_f32(v563, v532); + float32x2_t v682 = vadd_f32(v663, v681); + float32x2_t v683 = vsub_f32(v663, v681); + float32x2_t v693 = vadd_f32(v658, v692); + float32x2_t v708 = vsub_f32(v707, v676); + float32x2_t v111 = vmul_f32(v106, v1376); + float32x2_t v121 = vmul_f32(v107, v1386); + float32x2_t v133 = vadd_f32(v21, v106); + float32x2_t v139 = vrev64_f32(v117); + float32x2_t v147 = vrev64_f32(v132); + float32x2_t v255 = vmul_f32(v250, v1376); + float32x2_t v265 = vmul_f32(v251, v1386); + float32x2_t v277 = vadd_f32(v165, v250); + float32x2_t v283 = vrev64_f32(v261); + float32x2_t v291 = vrev64_f32(v276); + float32x2_t v399 = vmul_f32(v394, v1376); + float32x2_t v409 = vmul_f32(v395, v1386); + float32x2_t v421 = vadd_f32(v309, v394); + float32x2_t v427 = vrev64_f32(v405); + float32x2_t v435 = vrev64_f32(v420); + float32x2_t v543 = vmul_f32(v538, v1376); + float32x2_t v553 = vmul_f32(v539, v1386); + float32x2_t v565 = vadd_f32(v453, v538); + float32x2_t v571 = vrev64_f32(v549); + float32x2_t v579 = vrev64_f32(v564); + 
float32x2_t v687 = vmul_f32(v682, v1376); + float32x2_t v697 = vmul_f32(v683, v1386); + float32x2_t v709 = vadd_f32(v597, v682); + float32x2_t v715 = vrev64_f32(v693); + float32x2_t v723 = vrev64_f32(v708); + float32x2_t v112 = vsub_f32(v21, v111); + float32x2_t v140 = vmul_f32(v139, v1424); + float32x2_t v148 = vmul_f32(v147, v1424); + float32x2_t v256 = vsub_f32(v165, v255); + float32x2_t v284 = vmul_f32(v283, v1424); + float32x2_t v292 = vmul_f32(v291, v1424); + float32x2_t v400 = vsub_f32(v309, v399); + float32x2_t v428 = vmul_f32(v427, v1424); + float32x2_t v436 = vmul_f32(v435, v1424); + float32x2_t v544 = vsub_f32(v453, v543); + float32x2_t v572 = vmul_f32(v571, v1424); + float32x2_t v580 = vmul_f32(v579, v1424); + float32x2_t v688 = vsub_f32(v597, v687); + float32x2_t v716 = vmul_f32(v715, v1424); + float32x2_t v724 = vmul_f32(v723, v1424); + float32x2_t v772 = vsub_f32(v277, v709); + float32x2_t v776 = vmul_f32(v277, v1447); + float32x2_t v790 = vsub_f32(v421, v565); + float32x2_t v794 = vmul_f32(v421, v1447); + float32x2_t v122 = vsub_f32(v112, v121); + float32x2_t v126 = vmul_f32(v112, v1447); + float32x2_t v266 = vsub_f32(v256, v265); + float32x2_t v270 = vmul_f32(v256, v1447); + float32x2_t v410 = vsub_f32(v400, v409); + float32x2_t v414 = vmul_f32(v400, v1447); + float32x2_t v554 = vsub_f32(v544, v553); + float32x2_t v558 = vmul_f32(v544, v1447); + float32x2_t v698 = vsub_f32(v688, v697); + float32x2_t v702 = vmul_f32(v688, v1447); + float32x2_t v777 = vsub_f32(v776, v772); + float32x2_t v795 = vsub_f32(v794, v790); + float32x2_t v806 = vmul_f32(v790, v1396); + float32x2_t v821 = vmul_f32(v772, v1396); + float32x2_t v127 = vsub_f32(v126, v122); + float32x2_t v149 = vsub_f32(v122, v148); + float32x2_t v153 = vmul_f32(v122, v1447); + float32x2_t v271 = vsub_f32(v270, v266); + float32x2_t v293 = vsub_f32(v266, v292); + float32x2_t v297 = vmul_f32(v266, v1447); + float32x2_t v415 = vsub_f32(v414, v410); + float32x2_t v437 = vsub_f32(v410, v436); + float32x2_t v441 = vmul_f32(v410, v1447); + float32x2_t v559 = vsub_f32(v558, v554); + float32x2_t v581 = vsub_f32(v554, v580); + float32x2_t v585 = vmul_f32(v554, v1447); + float32x2_t v703 = vsub_f32(v702, v698); + float32x2_t v725 = vsub_f32(v698, v724); + float32x2_t v729 = vmul_f32(v698, v1447); + float32x2_t v796 = vadd_f32(v777, v795); + float32x2_t v797 = vsub_f32(v777, v795); + float32x2_t v807 = vadd_f32(v772, v806); + float32x2_t v822 = vsub_f32(v821, v790); + float32x2_t v141 = vsub_f32(v127, v140); + float32x2_t v154 = vsub_f32(v153, v149); + float32x2_t v158 = vmul_f32(v127, v1447); + float32x2_t v285 = vsub_f32(v271, v284); + float32x2_t v298 = vsub_f32(v297, v293); + float32x2_t v302 = vmul_f32(v271, v1447); + float32x2_t v429 = vsub_f32(v415, v428); + float32x2_t v442 = vsub_f32(v441, v437); + float32x2_t v446 = vmul_f32(v415, v1447); + float32x2_t v573 = vsub_f32(v559, v572); + float32x2_t v586 = vsub_f32(v585, v581); + float32x2_t v590 = vmul_f32(v559, v1447); + float32x2_t v717 = vsub_f32(v703, v716); + float32x2_t v730 = vsub_f32(v729, v725); + float32x2_t v734 = vmul_f32(v703, v1447); + float32x2_t v801 = vmul_f32(v796, v1376); + float32x2_t v811 = vmul_f32(v797, v1386); + float32x2_t v823 = vadd_f32(v133, v796); + float32x2_t v835 = vrev64_f32(v807); + float32x2_t v849 = vrev64_f32(v822); + float32x2_t v1033 = vrev64_f32(v293); + float32x2_t v1045 = vrev64_f32(v437); + float32x2_t v1057 = vrev64_f32(v725); + float32x2_t v1075 = vrev64_f32(v581); + float32x2_t v159 = vsub_f32(v158, v141); + float32x2_t v303 = 
vsub_f32(v302, v285); + float32x2_t v447 = vsub_f32(v446, v429); + float32x2_t v591 = vsub_f32(v590, v573); + float32x2_t v735 = vsub_f32(v734, v717); + float32x2_t v802 = vsub_f32(v133, v801); + int16x4_t v826 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v823, 15), (int32x2_t){0, 0})); + float32x2_t v836 = vmul_f32(v835, v1424); + float32x2_t v850 = vmul_f32(v849, v1424); + float32x2_t v889 = vrev64_f32(v285); + float32x2_t v901 = vrev64_f32(v429); + float32x2_t v913 = vrev64_f32(v717); + float32x2_t v931 = vrev64_f32(v573); + float32x2_t v1034 = vmul_f32(v1033, v1032); + float32x2_t v1046 = vmul_f32(v1045, v1320); + float32x2_t v1058 = vmul_f32(v1057, v1332); + float32x2_t v1076 = vmul_f32(v1075, v1188); + float32x2_t v1177 = vrev64_f32(v298); + float32x2_t v1189 = vrev64_f32(v442); + float32x2_t v1201 = vrev64_f32(v730); + float32x2_t v1219 = vrev64_f32(v586); + float32x2_t v812 = vsub_f32(v802, v811); + float32x2_t v816 = vmul_f32(v802, v1447); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v826), 0); + float32x2_t v890 = vmul_f32(v889, v888); + float32x2_t v902 = vmul_f32(v901, v1032); + float32x2_t v914 = vmul_f32(v913, v1320); + float32x2_t v932 = vmul_f32(v931, v1176); + float32x2_t v1035 = vfma_f32(v1034, v293, v1026); + float32x2_t v1047 = vfma_f32(v1046, v437, v1314); + float32x2_t v1059 = vfma_f32(v1058, v725, v1326); + float32x2_t v1077 = vfma_f32(v1076, v581, v1182); + float32x2_t v1178 = vmul_f32(v1177, v1176); + float32x2_t v1190 = vmul_f32(v1189, v1188); + float32x2_t v1202 = vmul_f32(v1201, v1362); + float32x2_t v1220 = vmul_f32(v1219, v1218); + float32x2_t v1321 = vrev64_f32(v303); + float32x2_t v1333 = vrev64_f32(v447); + float32x2_t v1345 = vrev64_f32(v735); + float32x2_t v1363 = vrev64_f32(v591); + float32x2_t v817 = vsub_f32(v816, v812); + float32x2_t v851 = vsub_f32(v812, v850); + float32x2_t v861 = vmul_f32(v812, v1447); + float32x2_t v891 = vfma_f32(v890, v285, v882); + float32x2_t v903 = vfma_f32(v902, v429, v1026); + float32x2_t v915 = vfma_f32(v914, v717, v1314); + float32x2_t v933 = vfma_f32(v932, v573, v1170); + float32x2_t v1060 = vsub_f32(v1035, v1059); + float32x2_t v1064 = vmul_f32(v1035, v1447); + float32x2_t v1078 = vsub_f32(v1047, v1077); + float32x2_t v1082 = vmul_f32(v1047, v1447); + float32x2_t v1179 = vfma_f32(v1178, v298, v1170); + float32x2_t v1191 = vfma_f32(v1190, v442, v1182); + float32x2_t v1203 = vfma_f32(v1202, v730, v1356); + float32x2_t v1221 = vfma_f32(v1220, v586, v1338); + float32x2_t v1322 = vmul_f32(v1321, v1320); + float32x2_t v1334 = vmul_f32(v1333, v1332); + float32x2_t v1346 = vmul_f32(v1345, v1344); + float32x2_t v1364 = vmul_f32(v1363, v1362); + float32x2_t v837 = vsub_f32(v817, v836); + int16x4_t v854 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v851, 15), (int32x2_t){0, 0})); + float32x2_t v862 = vsub_f32(v861, v851); + float32x2_t v872 = vmul_f32(v817, v1447); + float32x2_t v916 = vsub_f32(v891, v915); + float32x2_t v920 = vmul_f32(v891, v1447); + float32x2_t v934 = vsub_f32(v903, v933); + float32x2_t v938 = vmul_f32(v903, v1447); + float32x2_t v1065 = vsub_f32(v1064, v1060); + float32x2_t v1083 = vsub_f32(v1082, v1078); + float32x2_t v1094 = vmul_f32(v1078, v1396); + float32x2_t v1109 = vmul_f32(v1060, v1396); + float32x2_t v1204 = vsub_f32(v1179, v1203); + float32x2_t v1208 = vmul_f32(v1179, v1447); + float32x2_t v1222 = vsub_f32(v1191, v1221); + float32x2_t v1226 = vmul_f32(v1191, v1447); + float32x2_t v1323 = vfma_f32(v1322, v303, v1314); + float32x2_t v1335 = vfma_f32(v1334, v447, v1326); + float32x2_t v1347 = vfma_f32(v1346, 
v735, v1338); + float32x2_t v1365 = vfma_f32(v1364, v591, v1356); + int16x4_t v840 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v837, 15), (int32x2_t){0, 0})); + v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v854), 0); + int16x4_t v865 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v862, 15), (int32x2_t){0, 0})); + float32x2_t v873 = vsub_f32(v872, v837); + float32x2_t v921 = vsub_f32(v920, v916); + float32x2_t v939 = vsub_f32(v938, v934); + float32x2_t v950 = vmul_f32(v934, v1396); + float32x2_t v965 = vmul_f32(v916, v1396); + float32x2_t v1084 = vadd_f32(v1065, v1083); + float32x2_t v1085 = vsub_f32(v1065, v1083); + float32x2_t v1095 = vadd_f32(v1060, v1094); + float32x2_t v1110 = vsub_f32(v1109, v1078); + float32x2_t v1209 = vsub_f32(v1208, v1204); + float32x2_t v1227 = vsub_f32(v1226, v1222); + float32x2_t v1238 = vmul_f32(v1222, v1396); + float32x2_t v1253 = vmul_f32(v1204, v1396); + float32x2_t v1348 = vsub_f32(v1323, v1347); + float32x2_t v1352 = vmul_f32(v1323, v1447); + float32x2_t v1366 = vsub_f32(v1335, v1365); + float32x2_t v1370 = vmul_f32(v1335, v1447); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v840), 0); + v6[ostride * 15] = vget_lane_s32(vreinterpret_s32_s16(v865), 0); + int16x4_t v876 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v873, 15), (int32x2_t){0, 0})); + float32x2_t v940 = vadd_f32(v921, v939); + float32x2_t v941 = vsub_f32(v921, v939); + float32x2_t v951 = vadd_f32(v916, v950); + float32x2_t v966 = vsub_f32(v965, v934); + float32x2_t v1089 = vmul_f32(v1084, v1376); + float32x2_t v1099 = vmul_f32(v1085, v1386); + float32x2_t v1111 = vadd_f32(v149, v1084); + float32x2_t v1123 = vrev64_f32(v1095); + float32x2_t v1137 = vrev64_f32(v1110); + float32x2_t v1228 = vadd_f32(v1209, v1227); + float32x2_t v1229 = vsub_f32(v1209, v1227); + float32x2_t v1239 = vadd_f32(v1204, v1238); + float32x2_t v1254 = vsub_f32(v1253, v1222); + float32x2_t v1353 = vsub_f32(v1352, v1348); + float32x2_t v1371 = vsub_f32(v1370, v1366); + float32x2_t v1382 = vmul_f32(v1366, v1396); + float32x2_t v1397 = vmul_f32(v1348, v1396); + v6[ostride * 20] = vget_lane_s32(vreinterpret_s32_s16(v876), 0); + float32x2_t v945 = vmul_f32(v940, v1376); + float32x2_t v955 = vmul_f32(v941, v1386); + float32x2_t v967 = vadd_f32(v141, v940); + float32x2_t v979 = vrev64_f32(v951); + float32x2_t v993 = vrev64_f32(v966); + float32x2_t v1090 = vsub_f32(v149, v1089); + int16x4_t v1114 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1111, 15), (int32x2_t){0, 0})); + float32x2_t v1124 = vmul_f32(v1123, v1424); + float32x2_t v1138 = vmul_f32(v1137, v1424); + float32x2_t v1233 = vmul_f32(v1228, v1376); + float32x2_t v1243 = vmul_f32(v1229, v1386); + float32x2_t v1255 = vadd_f32(v154, v1228); + float32x2_t v1267 = vrev64_f32(v1239); + float32x2_t v1281 = vrev64_f32(v1254); + float32x2_t v1372 = vadd_f32(v1353, v1371); + float32x2_t v1373 = vsub_f32(v1353, v1371); + float32x2_t v1383 = vadd_f32(v1348, v1382); + float32x2_t v1398 = vsub_f32(v1397, v1366); + float32x2_t v946 = vsub_f32(v141, v945); + int16x4_t v970 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v967, 15), (int32x2_t){0, 0})); + float32x2_t v980 = vmul_f32(v979, v1424); + float32x2_t v994 = vmul_f32(v993, v1424); + float32x2_t v1100 = vsub_f32(v1090, v1099); + float32x2_t v1104 = vmul_f32(v1090, v1447); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v1114), 0); + float32x2_t v1234 = vsub_f32(v154, v1233); + int16x4_t v1258 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1255, 15), (int32x2_t){0, 0})); + float32x2_t v1268 = vmul_f32(v1267, v1424); + 
float32x2_t v1282 = vmul_f32(v1281, v1424); + float32x2_t v1377 = vmul_f32(v1372, v1376); + float32x2_t v1387 = vmul_f32(v1373, v1386); + float32x2_t v1399 = vadd_f32(v159, v1372); + float32x2_t v1411 = vrev64_f32(v1383); + float32x2_t v1425 = vrev64_f32(v1398); + float32x2_t v956 = vsub_f32(v946, v955); + float32x2_t v960 = vmul_f32(v946, v1447); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v970), 0); + float32x2_t v1105 = vsub_f32(v1104, v1100); + float32x2_t v1139 = vsub_f32(v1100, v1138); + float32x2_t v1149 = vmul_f32(v1100, v1447); + float32x2_t v1244 = vsub_f32(v1234, v1243); + float32x2_t v1248 = vmul_f32(v1234, v1447); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v1258), 0); + float32x2_t v1378 = vsub_f32(v159, v1377); + int16x4_t v1402 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1399, 15), (int32x2_t){0, 0})); + float32x2_t v1412 = vmul_f32(v1411, v1424); + float32x2_t v1426 = vmul_f32(v1425, v1424); + float32x2_t v961 = vsub_f32(v960, v956); + float32x2_t v995 = vsub_f32(v956, v994); + float32x2_t v1005 = vmul_f32(v956, v1447); + float32x2_t v1125 = vsub_f32(v1105, v1124); + int16x4_t v1142 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1139, 15), (int32x2_t){0, 0})); + float32x2_t v1150 = vsub_f32(v1149, v1139); + float32x2_t v1160 = vmul_f32(v1105, v1447); + float32x2_t v1249 = vsub_f32(v1248, v1244); + float32x2_t v1283 = vsub_f32(v1244, v1282); + float32x2_t v1293 = vmul_f32(v1244, v1447); + float32x2_t v1388 = vsub_f32(v1378, v1387); + float32x2_t v1392 = vmul_f32(v1378, v1447); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v1402), 0); + float32x2_t v981 = vsub_f32(v961, v980); + int16x4_t v998 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v995, 15), (int32x2_t){0, 0})); + float32x2_t v1006 = vsub_f32(v1005, v995); + float32x2_t v1016 = vmul_f32(v961, v1447); + int16x4_t v1128 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1125, 15), (int32x2_t){0, 0})); + v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v1142), 0); + int16x4_t v1153 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1150, 15), (int32x2_t){0, 0})); + float32x2_t v1161 = vsub_f32(v1160, v1125); + float32x2_t v1269 = vsub_f32(v1249, v1268); + int16x4_t v1286 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1283, 15), (int32x2_t){0, 0})); + float32x2_t v1294 = vsub_f32(v1293, v1283); + float32x2_t v1304 = vmul_f32(v1249, v1447); + float32x2_t v1393 = vsub_f32(v1392, v1388); + float32x2_t v1427 = vsub_f32(v1388, v1426); + float32x2_t v1437 = vmul_f32(v1388, v1447); + int16x4_t v984 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v981, 15), (int32x2_t){0, 0})); + v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v998), 0); + int16x4_t v1009 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1006, 15), (int32x2_t){0, 0})); + float32x2_t v1017 = vsub_f32(v1016, v981); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v1128), 0); + v6[ostride * 17] = vget_lane_s32(vreinterpret_s32_s16(v1153), 0); + int16x4_t v1164 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1161, 15), (int32x2_t){0, 0})); + int16x4_t v1272 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1269, 15), (int32x2_t){0, 0})); + v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v1286), 0); + int16x4_t v1297 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1294, 15), (int32x2_t){0, 0})); + float32x2_t v1305 = vsub_f32(v1304, v1269); + float32x2_t v1413 = vsub_f32(v1393, v1412); + int16x4_t v1430 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1427, 15), (int32x2_t){0, 0})); + float32x2_t v1438 = vsub_f32(v1437, v1427); + float32x2_t v1448 = vmul_f32(v1393, 
v1447); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v984), 0); + v6[ostride * 16] = vget_lane_s32(vreinterpret_s32_s16(v1009), 0); + int16x4_t v1020 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1017, 15), (int32x2_t){0, 0})); + v6[ostride * 22] = vget_lane_s32(vreinterpret_s32_s16(v1164), 0); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v1272), 0); + v6[ostride * 18] = vget_lane_s32(vreinterpret_s32_s16(v1297), 0); + int16x4_t v1308 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1305, 15), (int32x2_t){0, 0})); + int16x4_t v1416 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1413, 15), (int32x2_t){0, 0})); + v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v1430), 0); + int16x4_t v1441 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1438, 15), (int32x2_t){0, 0})); + float32x2_t v1449 = vsub_f32(v1448, v1413); + v6[ostride * 21] = vget_lane_s32(vreinterpret_s32_s16(v1020), 0); + v6[ostride * 23] = vget_lane_s32(vreinterpret_s32_s16(v1308), 0); + v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v1416), 0); + v6[ostride * 19] = vget_lane_s32(vreinterpret_s32_s16(v1441), 0); + int16x4_t v1452 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1449, 15), (int32x2_t){0, 0})); + v6[ostride * 24] = vget_lane_s32(vreinterpret_s32_s16(v1452), 0); + v5 += 1 * 1; + v6 += 1 * 1; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uu25(const armral_cmplx_int16_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * 1; + int64_t v12 = v10 * 1; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + float v1021 = 9.6858316112863108e-01F; + float v1026 = 2.4868988716485479e-01F; + float v1188 = 8.7630668004386358e-01F; + float v1193 = 4.8175367410171532e-01F; + float v1355 = 7.2896862742141155e-01F; + float v1360 = 6.8454710592868862e-01F; + float v1368 = 6.2790519529313527e-02F; + float v1373 = 9.9802672842827156e-01F; + float v1406 = 7.7051324277578925e-01F; + float v1522 = 5.3582679497899655e-01F; + float v1527 = 8.4432792550201508e-01F; + float v1535 = -4.2577929156507272e-01F; + float v1540 = 9.0482705246601947e-01F; + float v1548 = -6.3742398974868952e-01F; + float v1553 = -7.7051324277578936e-01F; + float v1568 = -9.9211470131447776e-01F; + float v1573 = 1.2533323356430454e-01F; + float v1590 = 2.5000000000000000e-01F; + float v1602 = 5.5901699437494745e-01F; + float v1614 = 6.1803398874989490e-01F; + float v1645 = -9.5105651629515353e-01F; + float v1675 = 2.0000000000000000e+00F; + const int32_t *v1759 = &v5[v0]; + int32_t *v2095 = &v6[v2]; + int64_t v27 = v0 * 5; + int64_t v35 = v0 * 10; + int64_t v43 = v0 * 15; + int64_t v51 = v0 * 20; + int64_t v194 = v0 * 6; + int64_t v202 = v0 * 11; + int64_t v210 = v0 * 16; + int64_t v218 = v0 * 21; + int64_t v353 = v0 * 2; + int64_t v361 = v0 * 7; + int64_t v369 = v0 * 12; + int64_t v377 = v0 * 17; + int64_t v385 = v0 * 22; + int64_t v520 = v0 * 3; + int64_t v528 = v0 * 8; + int64_t v536 = v0 * 13; + int64_t v544 = v0 * 18; + int64_t v552 = v0 * 23; + int64_t v687 = v0 * 4; + int64_t v695 = v0 * 9; + int64_t v703 = v0 * 14; + int64_t v711 = v0 * 19; + int64_t v719 = v0 * 24; + int64_t v969 = v2 * 5; + int64_t v985 = v2 * 10; + int64_t v999 = v2 * 15; + int64_t v1013 = v2 * 20; + float v1029 = v4 * v1026; + int64_t v1136 = v2 
* 6; + int64_t v1152 = v2 * 11; + int64_t v1166 = v2 * 16; + int64_t v1180 = v2 * 21; + float v1196 = v4 * v1193; + int64_t v1287 = v2 * 2; + int64_t v1303 = v2 * 7; + int64_t v1319 = v2 * 12; + int64_t v1333 = v2 * 17; + int64_t v1347 = v2 * 22; + float v1363 = v4 * v1360; + float v1376 = v4 * v1373; + float v1409 = v4 * v1406; + int64_t v1454 = v2 * 3; + int64_t v1470 = v2 * 8; + int64_t v1486 = v2 * 13; + int64_t v1500 = v2 * 18; + int64_t v1514 = v2 * 23; + float v1530 = v4 * v1527; + float v1543 = v4 * v1540; + float v1556 = v4 * v1553; + float v1576 = v4 * v1573; + int64_t v1621 = v2 * 4; + int64_t v1637 = v2 * 9; + float v1648 = v4 * v1645; + int64_t v1653 = v2 * 14; + int64_t v1667 = v2 * 19; + int64_t v1681 = v2 * 24; + const int32_t *v1695 = &v5[0]; + svfloat32_t v2017 = svdup_n_f32(0); + int32_t *v2031 = &v6[0]; + svfloat32_t v2074 = svdup_n_f32(v1021); + svfloat32_t v2138 = svdup_n_f32(v1188); + svfloat32_t v2202 = svdup_n_f32(v1355); + svfloat32_t v2204 = svdup_n_f32(v1368); + svfloat32_t v2266 = svdup_n_f32(v1522); + svfloat32_t v2268 = svdup_n_f32(v1535); + svfloat32_t v2270 = svdup_n_f32(v1548); + svfloat32_t v2273 = svdup_n_f32(v1568); + svfloat32_t v2276 = svdup_n_f32(v1590); + svfloat32_t v2278 = svdup_n_f32(v1602); + svfloat32_t v2280 = svdup_n_f32(v1614); + svfloat32_t v2320 = svdup_n_f32(v1675); + svfloat32_t v192 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1759[0])), + 1.F / (1ULL << 15ULL)); + const int32_t *v1704 = &v5[v27]; + const int32_t *v1713 = &v5[v35]; + const int32_t *v1722 = &v5[v43]; + const int32_t *v1731 = &v5[v51]; + const int32_t *v1768 = &v5[v194]; + const int32_t *v1777 = &v5[v202]; + const int32_t *v1786 = &v5[v210]; + const int32_t *v1795 = &v5[v218]; + const int32_t *v1823 = &v5[v353]; + const int32_t *v1832 = &v5[v361]; + const int32_t *v1841 = &v5[v369]; + const int32_t *v1850 = &v5[v377]; + const int32_t *v1859 = &v5[v385]; + const int32_t *v1887 = &v5[v520]; + const int32_t *v1896 = &v5[v528]; + const int32_t *v1905 = &v5[v536]; + const int32_t *v1914 = &v5[v544]; + const int32_t *v1923 = &v5[v552]; + const int32_t *v1951 = &v5[v687]; + const int32_t *v1960 = &v5[v695]; + const int32_t *v1969 = &v5[v703]; + const int32_t *v1978 = &v5[v711]; + const int32_t *v1987 = &v5[v719]; + int32_t *v2041 = &v6[v969]; + int32_t *v2051 = &v6[v985]; + int32_t *v2061 = &v6[v999]; + int32_t *v2071 = &v6[v1013]; + svfloat32_t v2075 = svdup_n_f32(v1029); + int32_t *v2105 = &v6[v1136]; + int32_t *v2115 = &v6[v1152]; + int32_t *v2125 = &v6[v1166]; + int32_t *v2135 = &v6[v1180]; + svfloat32_t v2139 = svdup_n_f32(v1196); + int32_t *v2159 = &v6[v1287]; + int32_t *v2169 = &v6[v1303]; + int32_t *v2179 = &v6[v1319]; + int32_t *v2189 = &v6[v1333]; + int32_t *v2199 = &v6[v1347]; + svfloat32_t v2203 = svdup_n_f32(v1363); + svfloat32_t v2205 = svdup_n_f32(v1376); + svfloat32_t v2210 = svdup_n_f32(v1409); + int32_t *v2223 = &v6[v1454]; + int32_t *v2233 = &v6[v1470]; + int32_t *v2243 = &v6[v1486]; + int32_t *v2253 = &v6[v1500]; + int32_t *v2263 = &v6[v1514]; + svfloat32_t v2267 = svdup_n_f32(v1530); + svfloat32_t v2269 = svdup_n_f32(v1543); + svfloat32_t v2271 = svdup_n_f32(v1556); + svfloat32_t v2274 = svdup_n_f32(v1576); + int32_t *v2287 = &v6[v1621]; + int32_t *v2297 = &v6[v1637]; + svfloat32_t v2300 = svdup_n_f32(v1648); + int32_t *v2307 = &v6[v1653]; + int32_t *v2317 = &v6[v1667]; + int32_t *v2327 = &v6[v1681]; + svfloat32_t v25 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, 
(const int16_t *)&v1695[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v33 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1704[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v41 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1713[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v49 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1722[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v57 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1731[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v200 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1768[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v208 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1777[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v216 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1786[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v224 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1795[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v359 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1823[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v367 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1832[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v375 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1841[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v383 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1850[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v391 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1859[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v526 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1887[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v534 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1896[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v542 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1905[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v550 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1914[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v558 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1923[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v693 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1951[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v701 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1960[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v709 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1969[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v717 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1978[0])), + 1.F / (1ULL << 15ULL)); + 
svfloat32_t v725 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1987[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v70 = svcmla_f32_x(pred_full, v33, v2017, v33, 90); + svfloat32_t v83 = svcmla_f32_x(pred_full, v41, v2017, v41, 90); + svfloat32_t v96 = svcmla_f32_x(pred_full, v57, v2017, v57, 90); + svfloat32_t v116 = svcmla_f32_x(pred_full, v49, v2017, v49, 90); + svfloat32_t v237 = svcmla_f32_x(pred_full, v200, v2017, v200, 90); + svfloat32_t v250 = svcmla_f32_x(pred_full, v208, v2017, v208, 90); + svfloat32_t v263 = svcmla_f32_x(pred_full, v224, v2017, v224, 90); + svfloat32_t v283 = svcmla_f32_x(pred_full, v216, v2017, v216, 90); + svfloat32_t v404 = svcmla_f32_x(pred_full, v367, v2017, v367, 90); + svfloat32_t v417 = svcmla_f32_x(pred_full, v375, v2017, v375, 90); + svfloat32_t v430 = svcmla_f32_x(pred_full, v391, v2017, v391, 90); + svfloat32_t v450 = svcmla_f32_x(pred_full, v383, v2017, v383, 90); + svfloat32_t v571 = svcmla_f32_x(pred_full, v534, v2017, v534, 90); + svfloat32_t v584 = svcmla_f32_x(pred_full, v542, v2017, v542, 90); + svfloat32_t v597 = svcmla_f32_x(pred_full, v558, v2017, v558, 90); + svfloat32_t v617 = svcmla_f32_x(pred_full, v550, v2017, v550, 90); + svfloat32_t v738 = svcmla_f32_x(pred_full, v701, v2017, v701, 90); + svfloat32_t v751 = svcmla_f32_x(pred_full, v709, v2017, v709, 90); + svfloat32_t v764 = svcmla_f32_x(pred_full, v725, v2017, v725, 90); + svfloat32_t v784 = svcmla_f32_x(pred_full, v717, v2017, v717, 90); + svfloat32_t v97 = svsub_f32_x(svptrue_b32(), v70, v96); + svfloat32_t v117 = svsub_f32_x(svptrue_b32(), v83, v116); + svfloat32_t v264 = svsub_f32_x(svptrue_b32(), v237, v263); + svfloat32_t v284 = svsub_f32_x(svptrue_b32(), v250, v283); + svfloat32_t v431 = svsub_f32_x(svptrue_b32(), v404, v430); + svfloat32_t v451 = svsub_f32_x(svptrue_b32(), v417, v450); + svfloat32_t v598 = svsub_f32_x(svptrue_b32(), v571, v597); + svfloat32_t v618 = svsub_f32_x(svptrue_b32(), v584, v617); + svfloat32_t v765 = svsub_f32_x(svptrue_b32(), v738, v764); + svfloat32_t v785 = svsub_f32_x(svptrue_b32(), v751, v784); + svfloat32_t v103 = svnmls_f32_x(pred_full, v97, v70, v2320); + svfloat32_t v123 = svnmls_f32_x(pred_full, v117, v83, v2320); + svfloat32_t v270 = svnmls_f32_x(pred_full, v264, v237, v2320); + svfloat32_t v290 = svnmls_f32_x(pred_full, v284, v250, v2320); + svfloat32_t v437 = svnmls_f32_x(pred_full, v431, v404, v2320); + svfloat32_t v457 = svnmls_f32_x(pred_full, v451, v417, v2320); + svfloat32_t v604 = svnmls_f32_x(pred_full, v598, v571, v2320); + svfloat32_t v624 = svnmls_f32_x(pred_full, v618, v584, v2320); + svfloat32_t v771 = svnmls_f32_x(pred_full, v765, v738, v2320); + svfloat32_t v791 = svnmls_f32_x(pred_full, v785, v751, v2320); + svfloat32_t v124 = svadd_f32_x(svptrue_b32(), v103, v123); + svfloat32_t v125 = svsub_f32_x(svptrue_b32(), v103, v123); + svfloat32_t v137 = svmla_f32_x(pred_full, v97, v117, v2280); + svfloat32_t v155 = svnmls_f32_x(pred_full, v117, v97, v2280); + svfloat32_t v291 = svadd_f32_x(svptrue_b32(), v270, v290); + svfloat32_t v292 = svsub_f32_x(svptrue_b32(), v270, v290); + svfloat32_t v304 = svmla_f32_x(pred_full, v264, v284, v2280); + svfloat32_t v322 = svnmls_f32_x(pred_full, v284, v264, v2280); + svfloat32_t v458 = svadd_f32_x(svptrue_b32(), v437, v457); + svfloat32_t v459 = svsub_f32_x(svptrue_b32(), v437, v457); + svfloat32_t v471 = svmla_f32_x(pred_full, v431, v451, v2280); + svfloat32_t v489 = svnmls_f32_x(pred_full, v451, v431, v2280); + svfloat32_t v625 = 
svadd_f32_x(svptrue_b32(), v604, v624); + svfloat32_t v626 = svsub_f32_x(svptrue_b32(), v604, v624); + svfloat32_t v638 = svmla_f32_x(pred_full, v598, v618, v2280); + svfloat32_t v656 = svnmls_f32_x(pred_full, v618, v598, v2280); + svfloat32_t v792 = svadd_f32_x(svptrue_b32(), v771, v791); + svfloat32_t v793 = svsub_f32_x(svptrue_b32(), v771, v791); + svfloat32_t v805 = svmla_f32_x(pred_full, v765, v785, v2280); + svfloat32_t v823 = svnmls_f32_x(pred_full, v785, v765, v2280); + svfloat32_t v156 = svadd_f32_x(svptrue_b32(), v25, v124); + svfloat32_t zero163 = svdup_n_f32(0); + svfloat32_t v163 = svcmla_f32_x(pred_full, zero163, v2300, v137, 90); + svfloat32_t zero171 = svdup_n_f32(0); + svfloat32_t v171 = svcmla_f32_x(pred_full, zero171, v2300, v155, 90); + svfloat32_t v323 = svadd_f32_x(svptrue_b32(), v192, v291); + svfloat32_t zero330 = svdup_n_f32(0); + svfloat32_t v330 = svcmla_f32_x(pred_full, zero330, v2300, v304, 90); + svfloat32_t zero338 = svdup_n_f32(0); + svfloat32_t v338 = svcmla_f32_x(pred_full, zero338, v2300, v322, 90); + svfloat32_t v490 = svadd_f32_x(svptrue_b32(), v359, v458); + svfloat32_t zero497 = svdup_n_f32(0); + svfloat32_t v497 = svcmla_f32_x(pred_full, zero497, v2300, v471, 90); + svfloat32_t zero505 = svdup_n_f32(0); + svfloat32_t v505 = svcmla_f32_x(pred_full, zero505, v2300, v489, 90); + svfloat32_t v657 = svadd_f32_x(svptrue_b32(), v526, v625); + svfloat32_t zero664 = svdup_n_f32(0); + svfloat32_t v664 = svcmla_f32_x(pred_full, zero664, v2300, v638, 90); + svfloat32_t zero672 = svdup_n_f32(0); + svfloat32_t v672 = svcmla_f32_x(pred_full, zero672, v2300, v656, 90); + svfloat32_t v824 = svadd_f32_x(svptrue_b32(), v693, v792); + svfloat32_t zero831 = svdup_n_f32(0); + svfloat32_t v831 = svcmla_f32_x(pred_full, zero831, v2300, v805, 90); + svfloat32_t zero839 = svdup_n_f32(0); + svfloat32_t v839 = svcmla_f32_x(pred_full, zero839, v2300, v823, 90); + svfloat32_t v131 = svmls_f32_x(pred_full, v25, v124, v2276); + svfloat32_t v298 = svmls_f32_x(pred_full, v192, v291, v2276); + svfloat32_t v465 = svmls_f32_x(pred_full, v359, v458, v2276); + svfloat32_t v632 = svmls_f32_x(pred_full, v526, v625, v2276); + svfloat32_t v799 = svmls_f32_x(pred_full, v693, v792, v2276); + svfloat32_t v143 = svmls_f32_x(pred_full, v131, v125, v2278); + svfloat32_t v310 = svmls_f32_x(pred_full, v298, v292, v2278); + svfloat32_t v477 = svmls_f32_x(pred_full, v465, v459, v2278); + svfloat32_t v644 = svmls_f32_x(pred_full, v632, v626, v2278); + svfloat32_t v811 = svmls_f32_x(pred_full, v799, v793, v2278); + svfloat32_t v865 = svcmla_f32_x(pred_full, v323, v2017, v323, 90); + svfloat32_t v878 = svcmla_f32_x(pred_full, v490, v2017, v490, 90); + svfloat32_t v891 = svcmla_f32_x(pred_full, v824, v2017, v824, 90); + svfloat32_t v911 = svcmla_f32_x(pred_full, v657, v2017, v657, 90); + svfloat32_t v149 = svnmls_f32_x(pred_full, v143, v131, v2320); + svfloat32_t v172 = svsub_f32_x(svptrue_b32(), v143, v171); + svfloat32_t v316 = svnmls_f32_x(pred_full, v310, v298, v2320); + svfloat32_t v339 = svsub_f32_x(svptrue_b32(), v310, v338); + svfloat32_t v483 = svnmls_f32_x(pred_full, v477, v465, v2320); + svfloat32_t v506 = svsub_f32_x(svptrue_b32(), v477, v505); + svfloat32_t v650 = svnmls_f32_x(pred_full, v644, v632, v2320); + svfloat32_t v673 = svsub_f32_x(svptrue_b32(), v644, v672); + svfloat32_t v817 = svnmls_f32_x(pred_full, v811, v799, v2320); + svfloat32_t v840 = svsub_f32_x(svptrue_b32(), v811, v839); + svfloat32_t v892 = svsub_f32_x(svptrue_b32(), v865, v891); + svfloat32_t v912 = 
svsub_f32_x(svptrue_b32(), v878, v911); + svfloat32_t v164 = svsub_f32_x(svptrue_b32(), v149, v163); + svfloat32_t v178 = svnmls_f32_x(pred_full, v172, v143, v2320); + svfloat32_t v331 = svsub_f32_x(svptrue_b32(), v316, v330); + svfloat32_t v345 = svnmls_f32_x(pred_full, v339, v310, v2320); + svfloat32_t v498 = svsub_f32_x(svptrue_b32(), v483, v497); + svfloat32_t v512 = svnmls_f32_x(pred_full, v506, v477, v2320); + svfloat32_t v665 = svsub_f32_x(svptrue_b32(), v650, v664); + svfloat32_t v679 = svnmls_f32_x(pred_full, v673, v644, v2320); + svfloat32_t v832 = svsub_f32_x(svptrue_b32(), v817, v831); + svfloat32_t v846 = svnmls_f32_x(pred_full, v840, v811, v2320); + svfloat32_t v898 = svnmls_f32_x(pred_full, v892, v865, v2320); + svfloat32_t v918 = svnmls_f32_x(pred_full, v912, v878, v2320); + svfloat32_t v1191 = svmul_f32_x(svptrue_b32(), v339, v2138); + svfloat32_t v1204 = svmul_f32_x(svptrue_b32(), v506, v2266); + svfloat32_t v1217 = svmul_f32_x(svptrue_b32(), v840, v2268); + svfloat32_t v1237 = svmul_f32_x(svptrue_b32(), v673, v2204); + svfloat32_t v184 = svnmls_f32_x(pred_full, v164, v149, v2320); + svfloat32_t v351 = svnmls_f32_x(pred_full, v331, v316, v2320); + svfloat32_t v518 = svnmls_f32_x(pred_full, v498, v483, v2320); + svfloat32_t v685 = svnmls_f32_x(pred_full, v665, v650, v2320); + svfloat32_t v852 = svnmls_f32_x(pred_full, v832, v817, v2320); + svfloat32_t v919 = svadd_f32_x(svptrue_b32(), v898, v918); + svfloat32_t v920 = svsub_f32_x(svptrue_b32(), v898, v918); + svfloat32_t v932 = svmla_f32_x(pred_full, v892, v912, v2280); + svfloat32_t v950 = svnmls_f32_x(pred_full, v912, v892, v2280); + svfloat32_t v1024 = svmul_f32_x(svptrue_b32(), v331, v2074); + svfloat32_t v1037 = svmul_f32_x(svptrue_b32(), v498, v2138); + svfloat32_t v1050 = svmul_f32_x(svptrue_b32(), v832, v2266); + svfloat32_t v1070 = svmul_f32_x(svptrue_b32(), v665, v2202); + svfloat32_t v1199 = svcmla_f32_x(pred_full, v1191, v2139, v339, 90); + svfloat32_t v1212 = svcmla_f32_x(pred_full, v1204, v2267, v506, 90); + svfloat32_t v1225 = svcmla_f32_x(pred_full, v1217, v2269, v840, 90); + svfloat32_t v1245 = svcmla_f32_x(pred_full, v1237, v2205, v673, 90); + svfloat32_t v1358 = svmul_f32_x(svptrue_b32(), v345, v2202); + svfloat32_t v1371 = svmul_f32_x(svptrue_b32(), v512, v2204); + svfloat32_t v1384 = svmul_f32_x(svptrue_b32(), v846, v2273); + svfloat32_t v1404 = svmul_f32_x(svptrue_b32(), v679, v2270); + svfloat32_t v951 = svadd_f32_x(svptrue_b32(), v156, v919); + svfloat32_t zero966 = svdup_n_f32(0); + svfloat32_t v966 = svcmla_f32_x(pred_full, zero966, v2300, v932, 90); + svfloat32_t zero982 = svdup_n_f32(0); + svfloat32_t v982 = svcmla_f32_x(pred_full, zero982, v2300, v950, 90); + svfloat32_t v1032 = svcmla_f32_x(pred_full, v1024, v2075, v331, 90); + svfloat32_t v1045 = svcmla_f32_x(pred_full, v1037, v2139, v498, 90); + svfloat32_t v1058 = svcmla_f32_x(pred_full, v1050, v2267, v832, 90); + svfloat32_t v1078 = svcmla_f32_x(pred_full, v1070, v2203, v665, 90); + svfloat32_t v1226 = svsub_f32_x(svptrue_b32(), v1199, v1225); + svfloat32_t v1246 = svsub_f32_x(svptrue_b32(), v1212, v1245); + svfloat32_t v1366 = svcmla_f32_x(pred_full, v1358, v2203, v345, 90); + svfloat32_t v1379 = svcmla_f32_x(pred_full, v1371, v2205, v512, 90); + svfloat32_t v1392 = svcmla_f32_x(pred_full, v1384, v2274, v846, 90); + svfloat32_t v1412 = svcmla_f32_x(pred_full, v1404, v2210, v679, 90); + svfloat32_t v1525 = svmul_f32_x(svptrue_b32(), v351, v2266); + svfloat32_t v1538 = svmul_f32_x(svptrue_b32(), v518, v2268); + svfloat32_t v1551 = 
svmul_f32_x(svptrue_b32(), v852, v2270); + svfloat32_t v1571 = svmul_f32_x(svptrue_b32(), v685, v2273); + svfloat32_t v926 = svmls_f32_x(pred_full, v156, v919, v2276); + svint16_t v954 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v951, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v1059 = svsub_f32_x(svptrue_b32(), v1032, v1058); + svfloat32_t v1079 = svsub_f32_x(svptrue_b32(), v1045, v1078); + svfloat32_t v1232 = svnmls_f32_x(pred_full, v1226, v1199, v2320); + svfloat32_t v1252 = svnmls_f32_x(pred_full, v1246, v1212, v2320); + svfloat32_t v1393 = svsub_f32_x(svptrue_b32(), v1366, v1392); + svfloat32_t v1413 = svsub_f32_x(svptrue_b32(), v1379, v1412); + svfloat32_t v1533 = svcmla_f32_x(pred_full, v1525, v2267, v351, 90); + svfloat32_t v1546 = svcmla_f32_x(pred_full, v1538, v2269, v518, 90); + svfloat32_t v1559 = svcmla_f32_x(pred_full, v1551, v2271, v852, 90); + svfloat32_t v1579 = svcmla_f32_x(pred_full, v1571, v2274, v685, 90); + svfloat32_t v938 = svmls_f32_x(pred_full, v926, v920, v2278); + svfloat32_t v1065 = svnmls_f32_x(pred_full, v1059, v1032, v2320); + svfloat32_t v1085 = svnmls_f32_x(pred_full, v1079, v1045, v2320); + svfloat32_t v1253 = svadd_f32_x(svptrue_b32(), v1232, v1252); + svfloat32_t v1254 = svsub_f32_x(svptrue_b32(), v1232, v1252); + svfloat32_t v1266 = svmla_f32_x(pred_full, v1226, v1246, v2280); + svfloat32_t v1284 = svnmls_f32_x(pred_full, v1246, v1226, v2280); + svfloat32_t v1399 = svnmls_f32_x(pred_full, v1393, v1366, v2320); + svfloat32_t v1419 = svnmls_f32_x(pred_full, v1413, v1379, v2320); + svfloat32_t v1560 = svsub_f32_x(svptrue_b32(), v1533, v1559); + svfloat32_t v1580 = svsub_f32_x(svptrue_b32(), v1546, v1579); + svst1w_u64(pred_full, (unsigned *)(v2031), svreinterpret_u64_s16(v954)); + svfloat32_t v944 = svnmls_f32_x(pred_full, v938, v926, v2320); + svfloat32_t v983 = svsub_f32_x(svptrue_b32(), v938, v982); + svfloat32_t v1086 = svadd_f32_x(svptrue_b32(), v1065, v1085); + svfloat32_t v1087 = svsub_f32_x(svptrue_b32(), v1065, v1085); + svfloat32_t v1099 = svmla_f32_x(pred_full, v1059, v1079, v2280); + svfloat32_t v1117 = svnmls_f32_x(pred_full, v1079, v1059, v2280); + svfloat32_t v1285 = svadd_f32_x(svptrue_b32(), v172, v1253); + svfloat32_t zero1300 = svdup_n_f32(0); + svfloat32_t v1300 = svcmla_f32_x(pred_full, zero1300, v2300, v1266, 90); + svfloat32_t zero1316 = svdup_n_f32(0); + svfloat32_t v1316 = svcmla_f32_x(pred_full, zero1316, v2300, v1284, 90); + svfloat32_t v1420 = svadd_f32_x(svptrue_b32(), v1399, v1419); + svfloat32_t v1421 = svsub_f32_x(svptrue_b32(), v1399, v1419); + svfloat32_t v1433 = svmla_f32_x(pred_full, v1393, v1413, v2280); + svfloat32_t v1451 = svnmls_f32_x(pred_full, v1413, v1393, v2280); + svfloat32_t v1566 = svnmls_f32_x(pred_full, v1560, v1533, v2320); + svfloat32_t v1586 = svnmls_f32_x(pred_full, v1580, v1546, v2320); + svfloat32_t v967 = svsub_f32_x(svptrue_b32(), v944, v966); + svint16_t v986 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v983, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v997 = svnmls_f32_x(pred_full, v983, v938, v2320); + svfloat32_t v1118 = svadd_f32_x(svptrue_b32(), v164, v1086); + svfloat32_t zero1133 = svdup_n_f32(0); + svfloat32_t v1133 = svcmla_f32_x(pred_full, zero1133, v2300, v1099, 90); + svfloat32_t zero1149 = svdup_n_f32(0); + svfloat32_t v1149 
= svcmla_f32_x(pred_full, zero1149, v2300, v1117, 90); + svfloat32_t v1260 = svmls_f32_x(pred_full, v172, v1253, v2276); + svint16_t v1288 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1285, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svfloat32_t v1452 = svadd_f32_x(svptrue_b32(), v178, v1420); + svfloat32_t zero1467 = svdup_n_f32(0); + svfloat32_t v1467 = svcmla_f32_x(pred_full, zero1467, v2300, v1433, 90); + svfloat32_t zero1483 = svdup_n_f32(0); + svfloat32_t v1483 = svcmla_f32_x(pred_full, zero1483, v2300, v1451, 90); + svfloat32_t v1587 = svadd_f32_x(svptrue_b32(), v1566, v1586); + svfloat32_t v1588 = svsub_f32_x(svptrue_b32(), v1566, v1586); + svfloat32_t v1600 = svmla_f32_x(pred_full, v1560, v1580, v2280); + svfloat32_t v1618 = svnmls_f32_x(pred_full, v1580, v1560, v2280); + svint16_t v970 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v967, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v1000 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v997, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v1011 = svnmls_f32_x(pred_full, v967, v944, v2320); + svfloat32_t v1093 = svmls_f32_x(pred_full, v164, v1086, v2276); + svint16_t v1121 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1118, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svfloat32_t v1272 = svmls_f32_x(pred_full, v1260, v1254, v2278); + svfloat32_t v1427 = svmls_f32_x(pred_full, v178, v1420, v2276); + svint16_t v1455 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1452, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svfloat32_t v1619 = svadd_f32_x(svptrue_b32(), v184, v1587); + svfloat32_t zero1634 = svdup_n_f32(0); + svfloat32_t v1634 = svcmla_f32_x(pred_full, zero1634, v2300, v1600, 90); + svfloat32_t zero1650 = svdup_n_f32(0); + svfloat32_t v1650 = svcmla_f32_x(pred_full, zero1650, v2300, v1618, 90); + svst1w_u64(pred_full, (unsigned *)(v2051), svreinterpret_u64_s16(v986)); + svst1w_u64(pred_full, (unsigned *)(v2159), svreinterpret_u64_s16(v1288)); + svint16_t v1014 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1011, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svfloat32_t v1105 = svmls_f32_x(pred_full, v1093, v1087, v2278); + svfloat32_t v1278 = svnmls_f32_x(pred_full, v1272, v1260, v2320); + svfloat32_t v1317 = svsub_f32_x(svptrue_b32(), v1272, v1316); + svfloat32_t v1439 = svmls_f32_x(pred_full, v1427, v1421, v2278); + svfloat32_t v1594 = svmls_f32_x(pred_full, v184, v1587, v2276); + svint16_t v1622 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1619, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v2041), svreinterpret_u64_s16(v970)); + svst1w_u64(pred_full, (unsigned *)(v2061), svreinterpret_u64_s16(v1000)); + svst1w_u64(pred_full, (unsigned *)(v2095), 
svreinterpret_u64_s16(v1121)); + svst1w_u64(pred_full, (unsigned *)(v2223), svreinterpret_u64_s16(v1455)); + svfloat32_t v1111 = svnmls_f32_x(pred_full, v1105, v1093, v2320); + svfloat32_t v1150 = svsub_f32_x(svptrue_b32(), v1105, v1149); + svfloat32_t v1301 = svsub_f32_x(svptrue_b32(), v1278, v1300); + svint16_t v1320 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1317, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svfloat32_t v1331 = svnmls_f32_x(pred_full, v1317, v1272, v2320); + svfloat32_t v1445 = svnmls_f32_x(pred_full, v1439, v1427, v2320); + svfloat32_t v1484 = svsub_f32_x(svptrue_b32(), v1439, v1483); + svfloat32_t v1606 = svmls_f32_x(pred_full, v1594, v1588, v2278); + svst1w_u64(pred_full, (unsigned *)(v2071), svreinterpret_u64_s16(v1014)); + svst1w_u64(pred_full, (unsigned *)(v2287), svreinterpret_u64_s16(v1622)); + svfloat32_t v1134 = svsub_f32_x(svptrue_b32(), v1111, v1133); + svint16_t v1153 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1150, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svfloat32_t v1164 = svnmls_f32_x(pred_full, v1150, v1105, v2320); + svint16_t v1304 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1301, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svint16_t v1334 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1331, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svfloat32_t v1345 = svnmls_f32_x(pred_full, v1301, v1278, v2320); + svfloat32_t v1468 = svsub_f32_x(svptrue_b32(), v1445, v1467); + svint16_t v1487 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1484, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svfloat32_t v1498 = svnmls_f32_x(pred_full, v1484, v1439, v2320); + svfloat32_t v1612 = svnmls_f32_x(pred_full, v1606, v1594, v2320); + svfloat32_t v1651 = svsub_f32_x(svptrue_b32(), v1606, v1650); + svst1w_u64(pred_full, (unsigned *)(v2179), svreinterpret_u64_s16(v1320)); + svint16_t v1137 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1134, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svint16_t v1167 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1164, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svfloat32_t v1178 = svnmls_f32_x(pred_full, v1134, v1111, v2320); + svint16_t v1348 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1345, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svint16_t v1471 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1468, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svint16_t v1501 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1498, (float)(1ULL << 
31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svfloat32_t v1512 = svnmls_f32_x(pred_full, v1468, v1445, v2320); + svfloat32_t v1635 = svsub_f32_x(svptrue_b32(), v1612, v1634); + svint16_t v1654 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1651, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svfloat32_t v1665 = svnmls_f32_x(pred_full, v1651, v1606, v2320); + svst1w_u64(pred_full, (unsigned *)(v2115), svreinterpret_u64_s16(v1153)); + svst1w_u64(pred_full, (unsigned *)(v2169), svreinterpret_u64_s16(v1304)); + svst1w_u64(pred_full, (unsigned *)(v2189), svreinterpret_u64_s16(v1334)); + svst1w_u64(pred_full, (unsigned *)(v2243), svreinterpret_u64_s16(v1487)); + svint16_t v1181 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1178, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svint16_t v1515 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1512, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svint16_t v1638 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1635, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svint16_t v1668 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1665, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svfloat32_t v1679 = svnmls_f32_x(pred_full, v1635, v1612, v2320); + svst1w_u64(pred_full, (unsigned *)(v2105), svreinterpret_u64_s16(v1137)); + svst1w_u64(pred_full, (unsigned *)(v2125), svreinterpret_u64_s16(v1167)); + svst1w_u64(pred_full, (unsigned *)(v2199), svreinterpret_u64_s16(v1348)); + svst1w_u64(pred_full, (unsigned *)(v2233), svreinterpret_u64_s16(v1471)); + svst1w_u64(pred_full, (unsigned *)(v2253), svreinterpret_u64_s16(v1501)); + svst1w_u64(pred_full, (unsigned *)(v2307), svreinterpret_u64_s16(v1654)); + svint16_t v1682 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1679, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v2135), svreinterpret_u64_s16(v1181)); + svst1w_u64(pred_full, (unsigned *)(v2263), svreinterpret_u64_s16(v1515)); + svst1w_u64(pred_full, (unsigned *)(v2297), svreinterpret_u64_s16(v1638)); + svst1w_u64(pred_full, (unsigned *)(v2317), svreinterpret_u64_s16(v1668)); + svst1w_u64(pred_full, (unsigned *)(v2327), svreinterpret_u64_s16(v1682)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uu32(const armral_cmplx_int16_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + for (int j = 0; j < howmany; j += 1) { + int16x4_t v339 = vld1s_s16(&v5[istride]); + float v790 = 7.0710678118654757e-01F; + float v801 = -7.0710678118654746e-01F; + float v851 = 5.5557023301960229e-01F; + float v865 = -1.9509032201612861e-01F; + float v916 = 9.2387953251128674e-01F; + float v923 = 
-9.2387953251128685e-01F; + float v926 = 3.8268343236508967e-01F; + float v927 = -3.8268343236508967e-01F; + float v973 = 1.9509032201612833e-01F; + float v976 = -9.8078528040323043e-01F; + float v977 = 9.8078528040323043e-01F; + float v984 = -5.5557023301960218e-01F; + float v987 = 8.3146961230254524e-01F; + float v988 = -8.3146961230254524e-01F; + float v998 = -1.0000000000000000e+00F; + float v999 = 1.0000000000000000e+00F; + float32x2_t v1001 = (float32x2_t){v4, v4}; + int16x4_t v20 = vld1s_s16(&v5[0]); + float32x2_t v340 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v339)), 15); + float32x2_t v608 = (float32x2_t){v977, v977}; + float32x2_t v669 = (float32x2_t){v916, v916}; + float32x2_t v673 = (float32x2_t){v927, v926}; + float32x2_t v730 = (float32x2_t){v987, v987}; + float32x2_t v734 = (float32x2_t){v984, v851}; + float32x2_t v741 = (float32x2_t){v865, v865}; + float32x2_t v791 = (float32x2_t){v790, v790}; + float32x2_t v802 = (float32x2_t){v801, v801}; + float32x2_t v806 = (float32x2_t){v999, v998}; + float32x2_t v852 = (float32x2_t){v851, v851}; + float32x2_t v856 = (float32x2_t){v988, v987}; + float32x2_t v863 = (float32x2_t){v976, v976}; + float32x2_t v867 = (float32x2_t){v865, v973}; + float32x2_t v913 = (float32x2_t){v926, v926}; + float32x2_t v917 = (float32x2_t){v923, v916}; + float32x2_t v924 = (float32x2_t){v923, v923}; + float32x2_t v928 = (float32x2_t){v926, v927}; + float32x2_t v974 = (float32x2_t){v973, v973}; + float32x2_t v978 = (float32x2_t){v976, v977}; + float32x2_t v985 = (float32x2_t){v984, v984}; + float32x2_t v989 = (float32x2_t){v987, v988}; + float32x2_t v1000 = (float32x2_t){v998, v999}; + float32x2_t v21 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v20)), 15); + int16x4_t v26 = vld1s_s16(&v5[istride * 16]); + int16x4_t v34 = vld1s_s16(&v5[istride * 8]); + int16x4_t v40 = vld1s_s16(&v5[istride * 24]); + int16x4_t v59 = vld1s_s16(&v5[istride * 4]); + int16x4_t v65 = vld1s_s16(&v5[istride * 20]); + int16x4_t v73 = vld1s_s16(&v5[istride * 12]); + int16x4_t v79 = vld1s_s16(&v5[istride * 28]); + int16x4_t v137 = vld1s_s16(&v5[istride * 2]); + int16x4_t v143 = vld1s_s16(&v5[istride * 18]); + int16x4_t v151 = vld1s_s16(&v5[istride * 10]); + int16x4_t v157 = vld1s_s16(&v5[istride * 26]); + int16x4_t v176 = vld1s_s16(&v5[istride * 6]); + int16x4_t v182 = vld1s_s16(&v5[istride * 22]); + int16x4_t v190 = vld1s_s16(&v5[istride * 14]); + int16x4_t v196 = vld1s_s16(&v5[istride * 30]); + int16x4_t v345 = vld1s_s16(&v5[istride * 17]); + int16x4_t v353 = vld1s_s16(&v5[istride * 9]); + int16x4_t v359 = vld1s_s16(&v5[istride * 25]); + int16x4_t v378 = vld1s_s16(&v5[istride * 5]); + int16x4_t v384 = vld1s_s16(&v5[istride * 21]); + int16x4_t v392 = vld1s_s16(&v5[istride * 13]); + int16x4_t v398 = vld1s_s16(&v5[istride * 29]); + int16x4_t v456 = vld1s_s16(&v5[istride * 3]); + int16x4_t v462 = vld1s_s16(&v5[istride * 19]); + int16x4_t v470 = vld1s_s16(&v5[istride * 11]); + int16x4_t v476 = vld1s_s16(&v5[istride * 27]); + int16x4_t v495 = vld1s_s16(&v5[istride * 7]); + int16x4_t v501 = vld1s_s16(&v5[istride * 23]); + int16x4_t v509 = vld1s_s16(&v5[istride * 15]); + int16x4_t v515 = vld1s_s16(&v5[istride * 31]); + float32x2_t v675 = vmul_f32(v1001, v673); + float32x2_t v736 = vmul_f32(v1001, v734); + float32x2_t v808 = vmul_f32(v1001, v806); + float32x2_t v858 = vmul_f32(v1001, v856); + float32x2_t v869 = vmul_f32(v1001, v867); + float32x2_t v919 = vmul_f32(v1001, v917); + float32x2_t v930 = vmul_f32(v1001, v928); + float32x2_t v980 = vmul_f32(v1001, v978); + float32x2_t v991 = 
vmul_f32(v1001, v989); + float32x2_t v1002 = vmul_f32(v1001, v1000); + float32x2_t v27 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v26)), 15); + float32x2_t v35 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v34)), 15); + float32x2_t v41 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v40)), 15); + float32x2_t v60 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v59)), 15); + float32x2_t v66 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v65)), 15); + float32x2_t v74 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v73)), 15); + float32x2_t v80 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v79)), 15); + float32x2_t v138 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v137)), 15); + float32x2_t v144 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v143)), 15); + float32x2_t v152 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v151)), 15); + float32x2_t v158 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v157)), 15); + float32x2_t v177 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v176)), 15); + float32x2_t v183 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v182)), 15); + float32x2_t v191 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v190)), 15); + float32x2_t v197 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v196)), 15); + float32x2_t v346 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v345)), 15); + float32x2_t v354 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v353)), 15); + float32x2_t v360 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v359)), 15); + float32x2_t v379 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v378)), 15); + float32x2_t v385 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v384)), 15); + float32x2_t v393 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v392)), 15); + float32x2_t v399 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v398)), 15); + float32x2_t v457 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v456)), 15); + float32x2_t v463 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v462)), 15); + float32x2_t v471 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v470)), 15); + float32x2_t v477 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v476)), 15); + float32x2_t v496 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v495)), 15); + float32x2_t v502 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v501)), 15); + float32x2_t v510 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v509)), 15); + float32x2_t v516 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v515)), 15); + float32x2_t v28 = vadd_f32(v21, v27); + float32x2_t v29 = vsub_f32(v21, v27); + float32x2_t v42 = vadd_f32(v35, v41); + float32x2_t v43 = vsub_f32(v35, v41); + float32x2_t v67 = vadd_f32(v60, v66); + float32x2_t v68 = vsub_f32(v60, v66); + float32x2_t v81 = vadd_f32(v74, v80); + float32x2_t v82 = vsub_f32(v74, v80); + float32x2_t v145 = vadd_f32(v138, v144); + float32x2_t v146 = vsub_f32(v138, v144); + float32x2_t v159 = vadd_f32(v152, v158); + float32x2_t v160 = vsub_f32(v152, v158); + float32x2_t v184 = vadd_f32(v177, v183); + float32x2_t v185 = vsub_f32(v177, v183); + float32x2_t v198 = vadd_f32(v191, v197); + float32x2_t v199 = vsub_f32(v191, v197); + float32x2_t v347 = vadd_f32(v340, v346); + float32x2_t v348 = vsub_f32(v340, v346); + float32x2_t v361 = vadd_f32(v354, v360); + float32x2_t v362 = vsub_f32(v354, v360); + float32x2_t v386 = vadd_f32(v379, v385); + float32x2_t v387 = vsub_f32(v379, v385); + float32x2_t v400 = vadd_f32(v393, v399); + float32x2_t v401 = vsub_f32(v393, v399); + float32x2_t v464 = vadd_f32(v457, v463); + float32x2_t v465 = vsub_f32(v457, v463); + float32x2_t v478 = vadd_f32(v471, v477); + float32x2_t v479 = vsub_f32(v471, v477); + float32x2_t v503 = vadd_f32(v496, v502); + float32x2_t v504 = vsub_f32(v496, v502); + float32x2_t v517 = vadd_f32(v510, v516); + float32x2_t v518 = 
vsub_f32(v510, v516); + float32x2_t v49 = vrev64_f32(v43); + float32x2_t v51 = vadd_f32(v28, v42); + float32x2_t v52 = vsub_f32(v28, v42); + float32x2_t v83 = vadd_f32(v67, v81); + float32x2_t v84 = vsub_f32(v67, v81); + float32x2_t v99 = vmul_f32(v68, v791); + float32x2_t v110 = vmul_f32(v82, v802); + float32x2_t v166 = vrev64_f32(v160); + float32x2_t v168 = vadd_f32(v145, v159); + float32x2_t v169 = vsub_f32(v145, v159); + float32x2_t v205 = vrev64_f32(v199); + float32x2_t v207 = vadd_f32(v184, v198); + float32x2_t v208 = vsub_f32(v184, v198); + float32x2_t v368 = vrev64_f32(v362); + float32x2_t v370 = vadd_f32(v347, v361); + float32x2_t v371 = vsub_f32(v347, v361); + float32x2_t v402 = vadd_f32(v386, v400); + float32x2_t v403 = vsub_f32(v386, v400); + float32x2_t v418 = vmul_f32(v387, v791); + float32x2_t v429 = vmul_f32(v401, v802); + float32x2_t v485 = vrev64_f32(v479); + float32x2_t v487 = vadd_f32(v464, v478); + float32x2_t v488 = vsub_f32(v464, v478); + float32x2_t v519 = vadd_f32(v503, v517); + float32x2_t v520 = vsub_f32(v503, v517); + float32x2_t v535 = vmul_f32(v504, v791); + float32x2_t v546 = vmul_f32(v518, v802); + float32x2_t v50 = vmul_f32(v49, v808); + float32x2_t v90 = vrev64_f32(v84); + float32x2_t v92 = vadd_f32(v51, v83); + float32x2_t v93 = vsub_f32(v51, v83); + float32x2_t v105 = vrev64_f32(v99); + float32x2_t v116 = vrev64_f32(v110); + float32x2_t v167 = vmul_f32(v166, v808); + float32x2_t v206 = vmul_f32(v205, v808); + float32x2_t v211 = vadd_f32(v168, v207); + float32x2_t v212 = vsub_f32(v168, v207); + float32x2_t v264 = vmul_f32(v169, v791); + float32x2_t v275 = vmul_f32(v208, v802); + float32x2_t v369 = vmul_f32(v368, v808); + float32x2_t v409 = vrev64_f32(v403); + float32x2_t v411 = vadd_f32(v370, v402); + float32x2_t v412 = vsub_f32(v370, v402); + float32x2_t v424 = vrev64_f32(v418); + float32x2_t v435 = vrev64_f32(v429); + float32x2_t v486 = vmul_f32(v485, v808); + float32x2_t v526 = vrev64_f32(v520); + float32x2_t v528 = vadd_f32(v487, v519); + float32x2_t v529 = vsub_f32(v487, v519); + float32x2_t v541 = vrev64_f32(v535); + float32x2_t v552 = vrev64_f32(v546); + float32x2_t v53 = vsub_f32(v29, v50); + float32x2_t v54 = vadd_f32(v29, v50); + float32x2_t v91 = vmul_f32(v90, v808); + float32x2_t v106 = vmul_f32(v105, v1002); + float32x2_t v117 = vmul_f32(v116, v808); + float32x2_t v170 = vsub_f32(v146, v167); + float32x2_t v171 = vadd_f32(v146, v167); + float32x2_t v209 = vsub_f32(v185, v206); + float32x2_t v210 = vadd_f32(v185, v206); + float32x2_t v218 = vrev64_f32(v212); + float32x2_t v220 = vadd_f32(v92, v211); + float32x2_t v221 = vsub_f32(v92, v211); + float32x2_t v270 = vrev64_f32(v264); + float32x2_t v281 = vrev64_f32(v275); + float32x2_t v372 = vsub_f32(v348, v369); + float32x2_t v373 = vadd_f32(v348, v369); + float32x2_t v410 = vmul_f32(v409, v808); + float32x2_t v425 = vmul_f32(v424, v1002); + float32x2_t v436 = vmul_f32(v435, v808); + float32x2_t v489 = vsub_f32(v465, v486); + float32x2_t v490 = vadd_f32(v465, v486); + float32x2_t v527 = vmul_f32(v526, v808); + float32x2_t v542 = vmul_f32(v541, v1002); + float32x2_t v553 = vmul_f32(v552, v808); + float32x2_t v569 = vadd_f32(v411, v528); + float32x2_t v570 = vsub_f32(v411, v528); + float32x2_t v792 = vmul_f32(v412, v791); + float32x2_t v803 = vmul_f32(v529, v802); + float32x2_t v94 = vsub_f32(v52, v91); + float32x2_t v95 = vadd_f32(v52, v91); + float32x2_t v118 = vadd_f32(v99, v106); + float32x2_t v119 = vadd_f32(v110, v117); + float32x2_t v219 = vmul_f32(v218, v808); + float32x2_t v227 = 
vmul_f32(v170, v669); + float32x2_t v233 = vrev64_f32(v170); + float32x2_t v238 = vmul_f32(v209, v913); + float32x2_t v244 = vrev64_f32(v209); + float32x2_t v271 = vmul_f32(v270, v1002); + float32x2_t v282 = vmul_f32(v281, v808); + float32x2_t v301 = vmul_f32(v171, v913); + float32x2_t v307 = vrev64_f32(v171); + float32x2_t v312 = vmul_f32(v210, v924); + float32x2_t v318 = vrev64_f32(v210); + float32x2_t v413 = vsub_f32(v371, v410); + float32x2_t v414 = vadd_f32(v371, v410); + float32x2_t v437 = vadd_f32(v418, v425); + float32x2_t v438 = vadd_f32(v429, v436); + float32x2_t v530 = vsub_f32(v488, v527); + float32x2_t v531 = vadd_f32(v488, v527); + float32x2_t v554 = vadd_f32(v535, v542); + float32x2_t v555 = vadd_f32(v546, v553); + float32x2_t v576 = vrev64_f32(v570); + float32x2_t v578 = vadd_f32(v220, v569); + float32x2_t v579 = vsub_f32(v220, v569); + float32x2_t v798 = vrev64_f32(v792); + float32x2_t v809 = vrev64_f32(v803); + float32x2_t v120 = vadd_f32(v118, v119); + float32x2_t v121 = vsub_f32(v119, v118); + float32x2_t v222 = vsub_f32(v93, v219); + float32x2_t v223 = vadd_f32(v93, v219); + float32x2_t v283 = vadd_f32(v264, v271); + float32x2_t v284 = vadd_f32(v275, v282); + float32x2_t v439 = vadd_f32(v437, v438); + float32x2_t v440 = vsub_f32(v438, v437); + float32x2_t v556 = vadd_f32(v554, v555); + float32x2_t v557 = vsub_f32(v555, v554); + float32x2_t v577 = vmul_f32(v576, v808); + int16x4_t v584 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v578, 15), (int32x2_t){0, 0})); + int16x4_t v596 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v579, 15), (int32x2_t){0, 0})); + float32x2_t v670 = vmul_f32(v413, v669); + float32x2_t v676 = vrev64_f32(v413); + float32x2_t v681 = vmul_f32(v530, v913); + float32x2_t v687 = vrev64_f32(v530); + float32x2_t v799 = vmul_f32(v798, v1002); + float32x2_t v810 = vmul_f32(v809, v808); + float32x2_t v914 = vmul_f32(v414, v913); + float32x2_t v920 = vrev64_f32(v414); + float32x2_t v925 = vmul_f32(v531, v924); + float32x2_t v931 = vrev64_f32(v531); + float32x2_t v127 = vrev64_f32(v121); + float32x2_t v129 = vadd_f32(v53, v120); + float32x2_t v130 = vsub_f32(v53, v120); + float32x2_t v246 = vfma_f32(v227, v233, v675); + float32x2_t v247 = vfma_f32(v238, v244, v919); + float32x2_t v285 = vadd_f32(v283, v284); + float32x2_t v286 = vsub_f32(v284, v283); + float32x2_t v320 = vfma_f32(v301, v307, v919); + float32x2_t v321 = vfma_f32(v312, v318, v930); + float32x2_t v446 = vrev64_f32(v440); + float32x2_t v448 = vadd_f32(v372, v439); + float32x2_t v449 = vsub_f32(v372, v439); + float32x2_t v563 = vrev64_f32(v557); + float32x2_t v565 = vadd_f32(v489, v556); + float32x2_t v566 = vsub_f32(v489, v556); + float32x2_t v580 = vsub_f32(v221, v577); + float32x2_t v581 = vadd_f32(v221, v577); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v584), 0); + v6[ostride * 16] = vget_lane_s32(vreinterpret_s32_s16(v596), 0); + float32x2_t v811 = vadd_f32(v792, v799); + float32x2_t v812 = vadd_f32(v803, v810); + float32x2_t v128 = vmul_f32(v127, v1002); + float32x2_t v248 = vadd_f32(v246, v247); + float32x2_t v249 = vsub_f32(v247, v246); + float32x2_t v292 = vrev64_f32(v286); + float32x2_t v294 = vadd_f32(v94, v285); + float32x2_t v295 = vsub_f32(v94, v285); + float32x2_t v322 = vadd_f32(v320, v321); + float32x2_t v323 = vsub_f32(v321, v320); + float32x2_t v447 = vmul_f32(v446, v1002); + float32x2_t v564 = vmul_f32(v563, v1002); + int16x4_t v590 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v580, 15), (int32x2_t){0, 0})); + int16x4_t v602 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v581, 15), 
(int32x2_t){0, 0})); + float32x2_t v609 = vmul_f32(v448, v608); + float32x2_t v615 = vrev64_f32(v448); + float32x2_t v620 = vmul_f32(v565, v730); + float32x2_t v626 = vrev64_f32(v565); + float32x2_t v689 = vfma_f32(v670, v676, v675); + float32x2_t v690 = vfma_f32(v681, v687, v919); + float32x2_t v813 = vadd_f32(v811, v812); + float32x2_t v814 = vsub_f32(v812, v811); + float32x2_t v853 = vmul_f32(v449, v852); + float32x2_t v859 = vrev64_f32(v449); + float32x2_t v864 = vmul_f32(v566, v863); + float32x2_t v870 = vrev64_f32(v566); + float32x2_t v933 = vfma_f32(v914, v920, v919); + float32x2_t v934 = vfma_f32(v925, v931, v930); + float32x2_t v131 = vsub_f32(v54, v128); + float32x2_t v132 = vadd_f32(v54, v128); + float32x2_t v255 = vrev64_f32(v249); + float32x2_t v257 = vadd_f32(v129, v248); + float32x2_t v258 = vsub_f32(v129, v248); + float32x2_t v293 = vmul_f32(v292, v1002); + float32x2_t v329 = vrev64_f32(v323); + float32x2_t v450 = vsub_f32(v373, v447); + float32x2_t v451 = vadd_f32(v373, v447); + float32x2_t v567 = vsub_f32(v490, v564); + float32x2_t v568 = vadd_f32(v490, v564); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v590), 0); + v6[ostride * 24] = vget_lane_s32(vreinterpret_s32_s16(v602), 0); + float32x2_t v691 = vadd_f32(v689, v690); + float32x2_t v692 = vsub_f32(v690, v689); + float32x2_t v820 = vrev64_f32(v814); + float32x2_t v822 = vadd_f32(v222, v813); + float32x2_t v823 = vsub_f32(v222, v813); + float32x2_t v935 = vadd_f32(v933, v934); + float32x2_t v936 = vsub_f32(v934, v933); + float32x2_t v256 = vmul_f32(v255, v1002); + float32x2_t v296 = vsub_f32(v95, v293); + float32x2_t v297 = vadd_f32(v95, v293); + float32x2_t v330 = vmul_f32(v329, v1002); + float32x2_t v331 = vadd_f32(v131, v322); + float32x2_t v332 = vsub_f32(v131, v322); + float32x2_t v628 = vfma_f32(v609, v615, v869); + float32x2_t v629 = vfma_f32(v620, v626, v736); + float32x2_t v698 = vrev64_f32(v692); + float32x2_t v700 = vadd_f32(v294, v691); + float32x2_t v701 = vsub_f32(v294, v691); + float32x2_t v731 = vmul_f32(v450, v730); + float32x2_t v737 = vrev64_f32(v450); + float32x2_t v742 = vmul_f32(v567, v741); + float32x2_t v748 = vrev64_f32(v567); + float32x2_t v821 = vmul_f32(v820, v1002); + int16x4_t v828 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v822, 15), (int32x2_t){0, 0})); + int16x4_t v840 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v823, 15), (int32x2_t){0, 0})); + float32x2_t v872 = vfma_f32(v853, v859, v858); + float32x2_t v873 = vfma_f32(v864, v870, v869); + float32x2_t v942 = vrev64_f32(v936); + float32x2_t v975 = vmul_f32(v451, v974); + float32x2_t v981 = vrev64_f32(v451); + float32x2_t v986 = vmul_f32(v568, v985); + float32x2_t v992 = vrev64_f32(v568); + float32x2_t v259 = vsub_f32(v130, v256); + float32x2_t v260 = vadd_f32(v130, v256); + float32x2_t v333 = vsub_f32(v132, v330); + float32x2_t v334 = vadd_f32(v132, v330); + float32x2_t v630 = vadd_f32(v628, v629); + float32x2_t v631 = vsub_f32(v629, v628); + float32x2_t v699 = vmul_f32(v698, v1002); + int16x4_t v706 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v700, 15), (int32x2_t){0, 0})); + int16x4_t v718 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v701, 15), (int32x2_t){0, 0})); + float32x2_t v824 = vsub_f32(v223, v821); + float32x2_t v825 = vadd_f32(v223, v821); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v828), 0); + v6[ostride * 20] = vget_lane_s32(vreinterpret_s32_s16(v840), 0); + float32x2_t v874 = vadd_f32(v872, v873); + float32x2_t v875 = vsub_f32(v873, v872); + float32x2_t v943 = vmul_f32(v942, v1002); + float32x2_t v944 = 
vadd_f32(v296, v935); + float32x2_t v945 = vsub_f32(v296, v935); + float32x2_t v637 = vrev64_f32(v631); + float32x2_t v639 = vadd_f32(v257, v630); + float32x2_t v640 = vsub_f32(v257, v630); + float32x2_t v702 = vsub_f32(v295, v699); + float32x2_t v703 = vadd_f32(v295, v699); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v706), 0); + v6[ostride * 18] = vget_lane_s32(vreinterpret_s32_s16(v718), 0); + float32x2_t v750 = vfma_f32(v731, v737, v736); + float32x2_t v751 = vfma_f32(v742, v748, v980); + int16x4_t v834 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v824, 15), (int32x2_t){0, 0})); + int16x4_t v846 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v825, 15), (int32x2_t){0, 0})); + float32x2_t v881 = vrev64_f32(v875); + float32x2_t v883 = vadd_f32(v259, v874); + float32x2_t v884 = vsub_f32(v259, v874); + float32x2_t v946 = vsub_f32(v297, v943); + float32x2_t v947 = vadd_f32(v297, v943); + int16x4_t v950 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v944, 15), (int32x2_t){0, 0})); + int16x4_t v962 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v945, 15), (int32x2_t){0, 0})); + float32x2_t v994 = vfma_f32(v975, v981, v980); + float32x2_t v995 = vfma_f32(v986, v992, v991); + float32x2_t v638 = vmul_f32(v637, v1002); + int16x4_t v645 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v639, 15), (int32x2_t){0, 0})); + int16x4_t v657 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v640, 15), (int32x2_t){0, 0})); + int16x4_t v712 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v702, 15), (int32x2_t){0, 0})); + int16x4_t v724 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v703, 15), (int32x2_t){0, 0})); + float32x2_t v752 = vadd_f32(v750, v751); + float32x2_t v753 = vsub_f32(v751, v750); + v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v834), 0); + v6[ostride * 28] = vget_lane_s32(vreinterpret_s32_s16(v846), 0); + float32x2_t v882 = vmul_f32(v881, v1002); + int16x4_t v889 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v883, 15), (int32x2_t){0, 0})); + int16x4_t v901 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v884, 15), (int32x2_t){0, 0})); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v950), 0); + int16x4_t v956 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v946, 15), (int32x2_t){0, 0})); + v6[ostride * 22] = vget_lane_s32(vreinterpret_s32_s16(v962), 0); + int16x4_t v968 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v947, 15), (int32x2_t){0, 0})); + float32x2_t v996 = vadd_f32(v994, v995); + float32x2_t v997 = vsub_f32(v995, v994); + float32x2_t v641 = vsub_f32(v258, v638); + float32x2_t v642 = vadd_f32(v258, v638); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v645), 0); + v6[ostride * 17] = vget_lane_s32(vreinterpret_s32_s16(v657), 0); + v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v712), 0); + v6[ostride * 26] = vget_lane_s32(vreinterpret_s32_s16(v724), 0); + float32x2_t v759 = vrev64_f32(v753); + float32x2_t v761 = vadd_f32(v331, v752); + float32x2_t v762 = vsub_f32(v331, v752); + float32x2_t v885 = vsub_f32(v260, v882); + float32x2_t v886 = vadd_f32(v260, v882); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v889), 0); + v6[ostride * 21] = vget_lane_s32(vreinterpret_s32_s16(v901), 0); + v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v956), 0); + v6[ostride * 30] = vget_lane_s32(vreinterpret_s32_s16(v968), 0); + float32x2_t v1003 = vrev64_f32(v997); + float32x2_t v1005 = vadd_f32(v333, v996); + float32x2_t v1006 = vsub_f32(v333, v996); + int16x4_t v651 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v641, 15), (int32x2_t){0, 0})); + int16x4_t v663 = + 
vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v642, 15), (int32x2_t){0, 0})); + float32x2_t v760 = vmul_f32(v759, v1002); + int16x4_t v767 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v761, 15), (int32x2_t){0, 0})); + int16x4_t v779 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v762, 15), (int32x2_t){0, 0})); + int16x4_t v895 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v885, 15), (int32x2_t){0, 0})); + int16x4_t v907 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v886, 15), (int32x2_t){0, 0})); + float32x2_t v1004 = vmul_f32(v1003, v1002); + int16x4_t v1011 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1005, 15), (int32x2_t){0, 0})); + int16x4_t v1023 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1006, 15), (int32x2_t){0, 0})); + v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v651), 0); + v6[ostride * 25] = vget_lane_s32(vreinterpret_s32_s16(v663), 0); + float32x2_t v763 = vsub_f32(v332, v760); + float32x2_t v764 = vadd_f32(v332, v760); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v767), 0); + v6[ostride * 19] = vget_lane_s32(vreinterpret_s32_s16(v779), 0); + v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v895), 0); + v6[ostride * 29] = vget_lane_s32(vreinterpret_s32_s16(v907), 0); + float32x2_t v1007 = vsub_f32(v334, v1004); + float32x2_t v1008 = vadd_f32(v334, v1004); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v1011), 0); + v6[ostride * 23] = vget_lane_s32(vreinterpret_s32_s16(v1023), 0); + int16x4_t v773 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v763, 15), (int32x2_t){0, 0})); + int16x4_t v785 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v764, 15), (int32x2_t){0, 0})); + int16x4_t v1017 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1007, 15), (int32x2_t){0, 0})); + int16x4_t v1029 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1008, 15), (int32x2_t){0, 0})); + v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v773), 0); + v6[ostride * 27] = vget_lane_s32(vreinterpret_s32_s16(v785), 0); + v6[ostride * 15] = vget_lane_s32(vreinterpret_s32_s16(v1017), 0); + v6[ostride * 31] = vget_lane_s32(vreinterpret_s32_s16(v1029), 0); + v5 += 1 * 1; + v6 += 1 * 1; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uu32(const armral_cmplx_int16_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * 1; + int64_t v12 = v10 * 1; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + float v847 = -1.9509032201612819e-01F; + float v906 = 7.0710678118654757e-01F; + float v918 = -7.0710678118654746e-01F; + float v923 = -1.0000000000000000e+00F; + float v977 = 5.5557023301960229e-01F; + float v982 = 8.3146961230254524e-01F; + float v989 = -9.8078528040323043e-01F; + float v1048 = 3.8268343236508984e-01F; + float v1053 = 9.2387953251128674e-01F; + float v1060 = -9.2387953251128685e-01F; + float v1065 = -3.8268343236508967e-01F; + float v1119 = 1.9509032201612833e-01F; + float v1124 = 9.8078528040323043e-01F; + float v1131 = -5.5557023301960218e-01F; + float v1136 = -8.3146961230254524e-01F; + const int32_t *v1365 = &v5[v0]; + int32_t *v1566 = &v6[v2]; + int64_t v27 = v0 * 16; + int64_t v37 = v0 * 8; + int64_t v45 = v0 * 24; + int64_t v66 = v0 * 4; + int64_t v74 = v0 * 20; + int64_t v84 = v0 * 12; + int64_t v92 = v0 * 28; + int64_t v154 = v0 * 2; + int64_t v162 = v0 * 18; + 
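+ // The v0 multiples computed here are input-stride offsets for the 32
+ // samples of this transform (indices 0-31), gathered in what appears to
+ // be bit-reversed, decimation-in-time order; the v2 multiples further
+ // below are the corresponding natural-order output-stride offsets.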
int64_t v172 = v0 * 10; + int64_t v180 = v0 * 26; + int64_t v201 = v0 * 6; + int64_t v209 = v0 * 22; + int64_t v219 = v0 * 14; + int64_t v227 = v0 * 30; + int64_t v386 = v0 * 17; + int64_t v396 = v0 * 9; + int64_t v404 = v0 * 25; + int64_t v425 = v0 * 5; + int64_t v433 = v0 * 21; + int64_t v443 = v0 * 13; + int64_t v451 = v0 * 29; + int64_t v513 = v0 * 3; + int64_t v521 = v0 * 19; + int64_t v531 = v0 * 11; + int64_t v539 = v0 * 27; + int64_t v560 = v0 * 7; + int64_t v568 = v0 * 23; + int64_t v578 = v0 * 15; + int64_t v586 = v0 * 31; + int64_t v669 = v2 * 8; + int64_t v677 = v2 * 16; + int64_t v685 = v2 * 24; + int64_t v740 = v2 * 9; + int64_t v748 = v2 * 17; + int64_t v756 = v2 * 25; + float v772 = v4 * v1048; + int64_t v803 = v2 * 2; + int64_t v811 = v2 * 10; + int64_t v819 = v2 * 18; + int64_t v827 = v2 * 26; + float v843 = v4 * v977; + int64_t v874 = v2 * 3; + int64_t v882 = v2 * 11; + int64_t v890 = v2 * 19; + int64_t v898 = v2 * 27; + float v926 = v4 * v923; + int64_t v945 = v2 * 4; + int64_t v953 = v2 * 12; + int64_t v961 = v2 * 20; + int64_t v969 = v2 * 28; + float v985 = v4 * v982; + float v997 = v4 * v1119; + int64_t v1016 = v2 * 5; + int64_t v1024 = v2 * 13; + int64_t v1032 = v2 * 21; + int64_t v1040 = v2 * 29; + float v1056 = v4 * v1053; + float v1068 = v4 * v1065; + int64_t v1087 = v2 * 6; + int64_t v1095 = v2 * 14; + int64_t v1103 = v2 * 22; + int64_t v1111 = v2 * 30; + float v1127 = v4 * v1124; + float v1139 = v4 * v1136; + int64_t v1158 = v2 * 7; + int64_t v1166 = v2 * 15; + int64_t v1174 = v2 * 23; + int64_t v1182 = v2 * 31; + const int32_t *v1196 = &v5[0]; + int32_t *v1525 = &v6[0]; + svfloat32_t v1555 = svdup_n_f32(v1124); + svfloat32_t v1596 = svdup_n_f32(v1053); + svfloat32_t v1637 = svdup_n_f32(v982); + svfloat32_t v1639 = svdup_n_f32(v847); + svfloat32_t v1678 = svdup_n_f32(v906); + svfloat32_t v1680 = svdup_n_f32(v918); + svfloat32_t v1719 = svdup_n_f32(v977); + svfloat32_t v1721 = svdup_n_f32(v989); + svfloat32_t v1760 = svdup_n_f32(v1048); + svfloat32_t v1762 = svdup_n_f32(v1060); + svfloat32_t v1801 = svdup_n_f32(v1119); + svfloat32_t v1803 = svdup_n_f32(v1131); + svfloat32_t v1805 = svdup_n_f32(v4); + svfloat32_t v384 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1365[0])), + 1.F / (1ULL << 15ULL)); + const int32_t *v1205 = &v5[v27]; + const int32_t *v1214 = &v5[v37]; + const int32_t *v1223 = &v5[v45]; + const int32_t *v1233 = &v5[v66]; + const int32_t *v1242 = &v5[v74]; + const int32_t *v1251 = &v5[v84]; + const int32_t *v1260 = &v5[v92]; + const int32_t *v1275 = &v5[v154]; + const int32_t *v1284 = &v5[v162]; + const int32_t *v1293 = &v5[v172]; + const int32_t *v1302 = &v5[v180]; + const int32_t *v1312 = &v5[v201]; + const int32_t *v1321 = &v5[v209]; + const int32_t *v1330 = &v5[v219]; + const int32_t *v1339 = &v5[v227]; + const int32_t *v1374 = &v5[v386]; + const int32_t *v1383 = &v5[v396]; + const int32_t *v1392 = &v5[v404]; + const int32_t *v1402 = &v5[v425]; + const int32_t *v1411 = &v5[v433]; + const int32_t *v1420 = &v5[v443]; + const int32_t *v1429 = &v5[v451]; + const int32_t *v1444 = &v5[v513]; + const int32_t *v1453 = &v5[v521]; + const int32_t *v1462 = &v5[v531]; + const int32_t *v1471 = &v5[v539]; + const int32_t *v1481 = &v5[v560]; + const int32_t *v1490 = &v5[v568]; + const int32_t *v1499 = &v5[v578]; + const int32_t *v1508 = &v5[v586]; + int32_t *v1534 = &v6[v669]; + int32_t *v1543 = &v6[v677]; + int32_t *v1552 = &v6[v685]; + int32_t *v1575 = &v6[v740]; + int32_t *v1584 = &v6[v748]; + 
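+ // The svdup_n_f32 broadcasts below hold the real/imaginary parts of the
+ // butterfly twiddle constants; the components pre-multiplied by v4 (the
+ // dir argument, assumed here to be +/-1) change sign with the transform
+ // direction, so the same kernel body serves forward and inverse FFTs.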
int32_t *v1593 = &v6[v756]; + svfloat32_t v1597 = svdup_n_f32(v772); + int32_t *v1607 = &v6[v803]; + int32_t *v1616 = &v6[v811]; + int32_t *v1625 = &v6[v819]; + int32_t *v1634 = &v6[v827]; + svfloat32_t v1638 = svdup_n_f32(v843); + int32_t *v1648 = &v6[v874]; + int32_t *v1657 = &v6[v882]; + int32_t *v1666 = &v6[v890]; + int32_t *v1675 = &v6[v898]; + svfloat32_t v1681 = svdup_n_f32(v926); + int32_t *v1689 = &v6[v945]; + int32_t *v1698 = &v6[v953]; + int32_t *v1707 = &v6[v961]; + int32_t *v1716 = &v6[v969]; + svfloat32_t v1720 = svdup_n_f32(v985); + svfloat32_t v1722 = svdup_n_f32(v997); + int32_t *v1730 = &v6[v1016]; + int32_t *v1739 = &v6[v1024]; + int32_t *v1748 = &v6[v1032]; + int32_t *v1757 = &v6[v1040]; + svfloat32_t v1761 = svdup_n_f32(v1056); + svfloat32_t v1763 = svdup_n_f32(v1068); + int32_t *v1771 = &v6[v1087]; + int32_t *v1780 = &v6[v1095]; + int32_t *v1789 = &v6[v1103]; + int32_t *v1798 = &v6[v1111]; + svfloat32_t v1802 = svdup_n_f32(v1127); + svfloat32_t v1804 = svdup_n_f32(v1139); + int32_t *v1812 = &v6[v1158]; + int32_t *v1821 = &v6[v1166]; + int32_t *v1830 = &v6[v1174]; + int32_t *v1839 = &v6[v1182]; + svfloat32_t v25 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1196[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v33 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1205[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v43 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1214[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v51 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1223[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v72 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1233[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v80 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1242[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v90 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1251[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v98 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1260[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v160 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1275[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v168 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1284[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v178 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1293[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v186 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1302[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v207 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1312[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v215 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1321[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v225 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1330[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v233 = svmul_n_f32_x( + 
pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1339[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v392 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1374[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v402 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1383[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v410 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1392[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v431 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1402[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v439 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1411[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v449 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1420[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v457 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1429[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v519 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1444[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v527 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1453[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v537 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1462[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v545 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1471[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v566 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1481[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v574 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1490[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v584 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1499[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v592 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1508[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v34 = svadd_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v35 = svsub_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v52 = svadd_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v53 = svsub_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v81 = svadd_f32_x(svptrue_b32(), v72, v80); + svfloat32_t v82 = svsub_f32_x(svptrue_b32(), v72, v80); + svfloat32_t v99 = svadd_f32_x(svptrue_b32(), v90, v98); + svfloat32_t v100 = svsub_f32_x(svptrue_b32(), v90, v98); + svfloat32_t v169 = svadd_f32_x(svptrue_b32(), v160, v168); + svfloat32_t v170 = svsub_f32_x(svptrue_b32(), v160, v168); + svfloat32_t v187 = svadd_f32_x(svptrue_b32(), v178, v186); + svfloat32_t v188 = svsub_f32_x(svptrue_b32(), v178, v186); + svfloat32_t v216 = svadd_f32_x(svptrue_b32(), v207, v215); + svfloat32_t v217 = svsub_f32_x(svptrue_b32(), v207, v215); + svfloat32_t v234 = svadd_f32_x(svptrue_b32(), v225, v233); + svfloat32_t v235 = svsub_f32_x(svptrue_b32(), v225, v233); + svfloat32_t v393 = svadd_f32_x(svptrue_b32(), v384, v392); + svfloat32_t v394 = 
svsub_f32_x(svptrue_b32(), v384, v392); + svfloat32_t v411 = svadd_f32_x(svptrue_b32(), v402, v410); + svfloat32_t v412 = svsub_f32_x(svptrue_b32(), v402, v410); + svfloat32_t v440 = svadd_f32_x(svptrue_b32(), v431, v439); + svfloat32_t v441 = svsub_f32_x(svptrue_b32(), v431, v439); + svfloat32_t v458 = svadd_f32_x(svptrue_b32(), v449, v457); + svfloat32_t v459 = svsub_f32_x(svptrue_b32(), v449, v457); + svfloat32_t v528 = svadd_f32_x(svptrue_b32(), v519, v527); + svfloat32_t v529 = svsub_f32_x(svptrue_b32(), v519, v527); + svfloat32_t v546 = svadd_f32_x(svptrue_b32(), v537, v545); + svfloat32_t v547 = svsub_f32_x(svptrue_b32(), v537, v545); + svfloat32_t v575 = svadd_f32_x(svptrue_b32(), v566, v574); + svfloat32_t v576 = svsub_f32_x(svptrue_b32(), v566, v574); + svfloat32_t v593 = svadd_f32_x(svptrue_b32(), v584, v592); + svfloat32_t v594 = svsub_f32_x(svptrue_b32(), v584, v592); + svfloat32_t zero60 = svdup_n_f32(0); + svfloat32_t v60 = svcmla_f32_x(pred_full, zero60, v1681, v53, 90); + svfloat32_t v61 = svadd_f32_x(svptrue_b32(), v34, v52); + svfloat32_t v62 = svsub_f32_x(svptrue_b32(), v34, v52); + svfloat32_t v101 = svadd_f32_x(svptrue_b32(), v81, v99); + svfloat32_t v102 = svsub_f32_x(svptrue_b32(), v81, v99); + svfloat32_t v118 = svmul_f32_x(svptrue_b32(), v82, v1678); + svfloat32_t v130 = svmul_f32_x(svptrue_b32(), v100, v1680); + svfloat32_t zero195 = svdup_n_f32(0); + svfloat32_t v195 = svcmla_f32_x(pred_full, zero195, v1681, v188, 90); + svfloat32_t v196 = svadd_f32_x(svptrue_b32(), v169, v187); + svfloat32_t v197 = svsub_f32_x(svptrue_b32(), v169, v187); + svfloat32_t zero242 = svdup_n_f32(0); + svfloat32_t v242 = svcmla_f32_x(pred_full, zero242, v1681, v235, 90); + svfloat32_t v243 = svadd_f32_x(svptrue_b32(), v216, v234); + svfloat32_t v244 = svsub_f32_x(svptrue_b32(), v216, v234); + svfloat32_t zero419 = svdup_n_f32(0); + svfloat32_t v419 = svcmla_f32_x(pred_full, zero419, v1681, v412, 90); + svfloat32_t v420 = svadd_f32_x(svptrue_b32(), v393, v411); + svfloat32_t v421 = svsub_f32_x(svptrue_b32(), v393, v411); + svfloat32_t v460 = svadd_f32_x(svptrue_b32(), v440, v458); + svfloat32_t v461 = svsub_f32_x(svptrue_b32(), v440, v458); + svfloat32_t v477 = svmul_f32_x(svptrue_b32(), v441, v1678); + svfloat32_t v489 = svmul_f32_x(svptrue_b32(), v459, v1680); + svfloat32_t zero554 = svdup_n_f32(0); + svfloat32_t v554 = svcmla_f32_x(pred_full, zero554, v1681, v547, 90); + svfloat32_t v555 = svadd_f32_x(svptrue_b32(), v528, v546); + svfloat32_t v556 = svsub_f32_x(svptrue_b32(), v528, v546); + svfloat32_t v595 = svadd_f32_x(svptrue_b32(), v575, v593); + svfloat32_t v596 = svsub_f32_x(svptrue_b32(), v575, v593); + svfloat32_t v612 = svmul_f32_x(svptrue_b32(), v576, v1678); + svfloat32_t v624 = svmul_f32_x(svptrue_b32(), v594, v1680); + svfloat32_t v63 = svsub_f32_x(svptrue_b32(), v35, v60); + svfloat32_t v64 = svadd_f32_x(svptrue_b32(), v35, v60); + svfloat32_t zero109 = svdup_n_f32(0); + svfloat32_t v109 = svcmla_f32_x(pred_full, zero109, v1681, v102, 90); + svfloat32_t v110 = svadd_f32_x(svptrue_b32(), v61, v101); + svfloat32_t v111 = svsub_f32_x(svptrue_b32(), v61, v101); + svfloat32_t v198 = svsub_f32_x(svptrue_b32(), v170, v195); + svfloat32_t v199 = svadd_f32_x(svptrue_b32(), v170, v195); + svfloat32_t v245 = svsub_f32_x(svptrue_b32(), v217, v242); + svfloat32_t v246 = svadd_f32_x(svptrue_b32(), v217, v242); + svfloat32_t v247 = svadd_f32_x(svptrue_b32(), v196, v243); + svfloat32_t v248 = svsub_f32_x(svptrue_b32(), v196, v243); + svfloat32_t v303 = svmul_f32_x(svptrue_b32(), v197, 
v1678); + svfloat32_t v315 = svmul_f32_x(svptrue_b32(), v244, v1680); + svfloat32_t v422 = svsub_f32_x(svptrue_b32(), v394, v419); + svfloat32_t v423 = svadd_f32_x(svptrue_b32(), v394, v419); + svfloat32_t zero468 = svdup_n_f32(0); + svfloat32_t v468 = svcmla_f32_x(pred_full, zero468, v1681, v461, 90); + svfloat32_t v469 = svadd_f32_x(svptrue_b32(), v420, v460); + svfloat32_t v470 = svsub_f32_x(svptrue_b32(), v420, v460); + svfloat32_t v557 = svsub_f32_x(svptrue_b32(), v529, v554); + svfloat32_t v558 = svadd_f32_x(svptrue_b32(), v529, v554); + svfloat32_t zero603 = svdup_n_f32(0); + svfloat32_t v603 = svcmla_f32_x(pred_full, zero603, v1681, v596, 90); + svfloat32_t v604 = svadd_f32_x(svptrue_b32(), v555, v595); + svfloat32_t v605 = svsub_f32_x(svptrue_b32(), v555, v595); + svfloat32_t v112 = svsub_f32_x(svptrue_b32(), v62, v109); + svfloat32_t v113 = svadd_f32_x(svptrue_b32(), v62, v109); + svfloat32_t v138 = svcmla_f32_x(pred_full, v118, v1805, v118, 90); + svfloat32_t v139 = svcmla_f32_x(pred_full, v130, v1681, v130, 90); + svfloat32_t zero255 = svdup_n_f32(0); + svfloat32_t v255 = svcmla_f32_x(pred_full, zero255, v1681, v248, 90); + svfloat32_t v256 = svadd_f32_x(svptrue_b32(), v110, v247); + svfloat32_t v257 = svsub_f32_x(svptrue_b32(), v110, v247); + svfloat32_t v264 = svmul_f32_x(svptrue_b32(), v198, v1596); + svfloat32_t v276 = svmul_f32_x(svptrue_b32(), v245, v1760); + svfloat32_t v342 = svmul_f32_x(svptrue_b32(), v199, v1760); + svfloat32_t v354 = svmul_f32_x(svptrue_b32(), v246, v1762); + svfloat32_t v471 = svsub_f32_x(svptrue_b32(), v421, v468); + svfloat32_t v472 = svadd_f32_x(svptrue_b32(), v421, v468); + svfloat32_t v497 = svcmla_f32_x(pred_full, v477, v1805, v477, 90); + svfloat32_t v498 = svcmla_f32_x(pred_full, v489, v1681, v489, 90); + svfloat32_t v606 = svsub_f32_x(svptrue_b32(), v556, v603); + svfloat32_t v607 = svadd_f32_x(svptrue_b32(), v556, v603); + svfloat32_t v632 = svcmla_f32_x(pred_full, v612, v1805, v612, 90); + svfloat32_t v633 = svcmla_f32_x(pred_full, v624, v1681, v624, 90); + svfloat32_t v647 = svadd_f32_x(svptrue_b32(), v469, v604); + svfloat32_t v648 = svsub_f32_x(svptrue_b32(), v469, v604); + svfloat32_t v909 = svmul_f32_x(svptrue_b32(), v470, v1678); + svfloat32_t v921 = svmul_f32_x(svptrue_b32(), v605, v1680); + svfloat32_t v140 = svadd_f32_x(svptrue_b32(), v138, v139); + svfloat32_t v141 = svsub_f32_x(svptrue_b32(), v139, v138); + svfloat32_t v258 = svsub_f32_x(svptrue_b32(), v111, v255); + svfloat32_t v259 = svadd_f32_x(svptrue_b32(), v111, v255); + svfloat32_t v284 = svcmla_f32_x(pred_full, v264, v1597, v198, 90); + svfloat32_t v285 = svcmla_f32_x(pred_full, v276, v1761, v245, 90); + svfloat32_t v323 = svcmla_f32_x(pred_full, v303, v1805, v303, 90); + svfloat32_t v324 = svcmla_f32_x(pred_full, v315, v1681, v315, 90); + svfloat32_t v362 = svcmla_f32_x(pred_full, v342, v1761, v199, 90); + svfloat32_t v363 = svcmla_f32_x(pred_full, v354, v1763, v246, 90); + svfloat32_t v499 = svadd_f32_x(svptrue_b32(), v497, v498); + svfloat32_t v500 = svsub_f32_x(svptrue_b32(), v498, v497); + svfloat32_t v634 = svadd_f32_x(svptrue_b32(), v632, v633); + svfloat32_t v635 = svsub_f32_x(svptrue_b32(), v633, v632); + svfloat32_t zero655 = svdup_n_f32(0); + svfloat32_t v655 = svcmla_f32_x(pred_full, zero655, v1681, v648, 90); + svfloat32_t v656 = svadd_f32_x(svptrue_b32(), v256, v647); + svfloat32_t v657 = svsub_f32_x(svptrue_b32(), v256, v647); + svfloat32_t v767 = svmul_f32_x(svptrue_b32(), v471, v1596); + svfloat32_t v779 = svmul_f32_x(svptrue_b32(), v606, v1760); + 
svfloat32_t v1051 = svmul_f32_x(svptrue_b32(), v472, v1760); + svfloat32_t v1063 = svmul_f32_x(svptrue_b32(), v607, v1762); + svfloat32_t zero148 = svdup_n_f32(0); + svfloat32_t v148 = svcmla_f32_x(pred_full, zero148, v1805, v141, 90); + svfloat32_t v149 = svadd_f32_x(svptrue_b32(), v63, v140); + svfloat32_t v150 = svsub_f32_x(svptrue_b32(), v63, v140); + svfloat32_t v286 = svadd_f32_x(svptrue_b32(), v284, v285); + svfloat32_t v287 = svsub_f32_x(svptrue_b32(), v285, v284); + svfloat32_t v325 = svadd_f32_x(svptrue_b32(), v323, v324); + svfloat32_t v326 = svsub_f32_x(svptrue_b32(), v324, v323); + svfloat32_t v364 = svadd_f32_x(svptrue_b32(), v362, v363); + svfloat32_t v365 = svsub_f32_x(svptrue_b32(), v363, v362); + svfloat32_t zero507 = svdup_n_f32(0); + svfloat32_t v507 = svcmla_f32_x(pred_full, zero507, v1805, v500, 90); + svfloat32_t v508 = svadd_f32_x(svptrue_b32(), v422, v499); + svfloat32_t v509 = svsub_f32_x(svptrue_b32(), v422, v499); + svfloat32_t zero642 = svdup_n_f32(0); + svfloat32_t v642 = svcmla_f32_x(pred_full, zero642, v1805, v635, 90); + svfloat32_t v643 = svadd_f32_x(svptrue_b32(), v557, v634); + svfloat32_t v644 = svsub_f32_x(svptrue_b32(), v557, v634); + svfloat32_t v658 = svsub_f32_x(svptrue_b32(), v257, v655); + svfloat32_t v659 = svadd_f32_x(svptrue_b32(), v257, v655); + svint16_t v662 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v656, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v678 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v657, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v787 = svcmla_f32_x(pred_full, v767, v1597, v471, 90); + svfloat32_t v788 = svcmla_f32_x(pred_full, v779, v1761, v606, 90); + svfloat32_t v929 = svcmla_f32_x(pred_full, v909, v1805, v909, 90); + svfloat32_t v930 = svcmla_f32_x(pred_full, v921, v1681, v921, 90); + svfloat32_t v1071 = svcmla_f32_x(pred_full, v1051, v1761, v472, 90); + svfloat32_t v1072 = svcmla_f32_x(pred_full, v1063, v1763, v607, 90); + svfloat32_t v151 = svsub_f32_x(svptrue_b32(), v64, v148); + svfloat32_t v152 = svadd_f32_x(svptrue_b32(), v64, v148); + svfloat32_t zero294 = svdup_n_f32(0); + svfloat32_t v294 = svcmla_f32_x(pred_full, zero294, v1805, v287, 90); + svfloat32_t v295 = svadd_f32_x(svptrue_b32(), v149, v286); + svfloat32_t v296 = svsub_f32_x(svptrue_b32(), v149, v286); + svfloat32_t zero333 = svdup_n_f32(0); + svfloat32_t v333 = svcmla_f32_x(pred_full, zero333, v1805, v326, 90); + svfloat32_t v334 = svadd_f32_x(svptrue_b32(), v112, v325); + svfloat32_t v335 = svsub_f32_x(svptrue_b32(), v112, v325); + svfloat32_t zero372 = svdup_n_f32(0); + svfloat32_t v372 = svcmla_f32_x(pred_full, zero372, v1805, v365, 90); + svfloat32_t v510 = svsub_f32_x(svptrue_b32(), v423, v507); + svfloat32_t v511 = svadd_f32_x(svptrue_b32(), v423, v507); + svfloat32_t v645 = svsub_f32_x(svptrue_b32(), v558, v642); + svfloat32_t v646 = svadd_f32_x(svptrue_b32(), v558, v642); + svint16_t v670 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v658, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v686 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v659, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + 
svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v696 = svmul_f32_x(svptrue_b32(), v508, v1555); + svfloat32_t v708 = svmul_f32_x(svptrue_b32(), v643, v1637); + svfloat32_t v789 = svadd_f32_x(svptrue_b32(), v787, v788); + svfloat32_t v790 = svsub_f32_x(svptrue_b32(), v788, v787); + svfloat32_t v931 = svadd_f32_x(svptrue_b32(), v929, v930); + svfloat32_t v932 = svsub_f32_x(svptrue_b32(), v930, v929); + svfloat32_t v980 = svmul_f32_x(svptrue_b32(), v509, v1719); + svfloat32_t v992 = svmul_f32_x(svptrue_b32(), v644, v1721); + svfloat32_t v1073 = svadd_f32_x(svptrue_b32(), v1071, v1072); + svfloat32_t v1074 = svsub_f32_x(svptrue_b32(), v1072, v1071); + svst1w_u64(pred_full, (unsigned *)(v1525), svreinterpret_u64_s16(v662)); + svst1w_u64(pred_full, (unsigned *)(v1543), svreinterpret_u64_s16(v678)); + svfloat32_t v297 = svsub_f32_x(svptrue_b32(), v150, v294); + svfloat32_t v298 = svadd_f32_x(svptrue_b32(), v150, v294); + svfloat32_t v336 = svsub_f32_x(svptrue_b32(), v113, v333); + svfloat32_t v337 = svadd_f32_x(svptrue_b32(), v113, v333); + svfloat32_t v373 = svadd_f32_x(svptrue_b32(), v151, v364); + svfloat32_t v374 = svsub_f32_x(svptrue_b32(), v151, v364); + svfloat32_t v375 = svsub_f32_x(svptrue_b32(), v152, v372); + svfloat32_t v376 = svadd_f32_x(svptrue_b32(), v152, v372); + svfloat32_t v716 = svcmla_f32_x(pred_full, v696, v1722, v508, 90); + svfloat32_t v717 = svcmla_f32_x(pred_full, v708, v1638, v643, 90); + svfloat32_t zero797 = svdup_n_f32(0); + svfloat32_t v797 = svcmla_f32_x(pred_full, zero797, v1805, v790, 90); + svfloat32_t v798 = svadd_f32_x(svptrue_b32(), v334, v789); + svfloat32_t v799 = svsub_f32_x(svptrue_b32(), v334, v789); + svfloat32_t v838 = svmul_f32_x(svptrue_b32(), v510, v1637); + svfloat32_t v850 = svmul_f32_x(svptrue_b32(), v645, v1639); + svfloat32_t zero939 = svdup_n_f32(0); + svfloat32_t v939 = svcmla_f32_x(pred_full, zero939, v1805, v932, 90); + svfloat32_t v940 = svadd_f32_x(svptrue_b32(), v258, v931); + svfloat32_t v941 = svsub_f32_x(svptrue_b32(), v258, v931); + svfloat32_t v1000 = svcmla_f32_x(pred_full, v980, v1720, v509, 90); + svfloat32_t v1001 = svcmla_f32_x(pred_full, v992, v1722, v644, 90); + svfloat32_t zero1081 = svdup_n_f32(0); + svfloat32_t v1081 = svcmla_f32_x(pred_full, zero1081, v1805, v1074, 90); + svfloat32_t v1122 = svmul_f32_x(svptrue_b32(), v511, v1801); + svfloat32_t v1134 = svmul_f32_x(svptrue_b32(), v646, v1803); + svst1w_u64(pred_full, (unsigned *)(v1534), svreinterpret_u64_s16(v670)); + svst1w_u64(pred_full, (unsigned *)(v1552), svreinterpret_u64_s16(v686)); + svfloat32_t v718 = svadd_f32_x(svptrue_b32(), v716, v717); + svfloat32_t v719 = svsub_f32_x(svptrue_b32(), v717, v716); + svfloat32_t v800 = svsub_f32_x(svptrue_b32(), v335, v797); + svfloat32_t v801 = svadd_f32_x(svptrue_b32(), v335, v797); + svint16_t v804 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v798, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v820 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v799, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v858 = svcmla_f32_x(pred_full, v838, v1638, v510, 90); + svfloat32_t v859 = svcmla_f32_x(pred_full, v850, v1802, v645, 90); + svfloat32_t v942 = svsub_f32_x(svptrue_b32(), v259, v939); + svfloat32_t v943 = svadd_f32_x(svptrue_b32(), v259, v939); + 
svint16_t v946 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v940, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v962 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v941, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v1002 = svadd_f32_x(svptrue_b32(), v1000, v1001); + svfloat32_t v1003 = svsub_f32_x(svptrue_b32(), v1001, v1000); + svfloat32_t v1082 = svadd_f32_x(svptrue_b32(), v336, v1073); + svfloat32_t v1083 = svsub_f32_x(svptrue_b32(), v336, v1073); + svfloat32_t v1084 = svsub_f32_x(svptrue_b32(), v337, v1081); + svfloat32_t v1085 = svadd_f32_x(svptrue_b32(), v337, v1081); + svfloat32_t v1142 = svcmla_f32_x(pred_full, v1122, v1802, v511, 90); + svfloat32_t v1143 = svcmla_f32_x(pred_full, v1134, v1804, v646, 90); + svfloat32_t zero726 = svdup_n_f32(0); + svfloat32_t v726 = svcmla_f32_x(pred_full, zero726, v1805, v719, 90); + svfloat32_t v727 = svadd_f32_x(svptrue_b32(), v295, v718); + svfloat32_t v728 = svsub_f32_x(svptrue_b32(), v295, v718); + svint16_t v812 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v800, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v828 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v801, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v860 = svadd_f32_x(svptrue_b32(), v858, v859); + svfloat32_t v861 = svsub_f32_x(svptrue_b32(), v859, v858); + svint16_t v954 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v942, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v970 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v943, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t zero1010 = svdup_n_f32(0); + svfloat32_t v1010 = svcmla_f32_x(pred_full, zero1010, v1805, v1003, 90); + svfloat32_t v1011 = svadd_f32_x(svptrue_b32(), v297, v1002); + svfloat32_t v1012 = svsub_f32_x(svptrue_b32(), v297, v1002); + svint16_t v1088 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1082, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svint16_t v1096 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1084, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svint16_t v1104 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1083, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svint16_t v1112 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1085, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svfloat32_t v1144 = svadd_f32_x(svptrue_b32(), v1142, v1143); + svfloat32_t v1145 = svsub_f32_x(svptrue_b32(), v1143, v1142); 
+ svst1w_u64(pred_full, (unsigned *)(v1607), svreinterpret_u64_s16(v804)); + svst1w_u64(pred_full, (unsigned *)(v1625), svreinterpret_u64_s16(v820)); + svst1w_u64(pred_full, (unsigned *)(v1689), svreinterpret_u64_s16(v946)); + svst1w_u64(pred_full, (unsigned *)(v1707), svreinterpret_u64_s16(v962)); + svfloat32_t v729 = svsub_f32_x(svptrue_b32(), v296, v726); + svfloat32_t v730 = svadd_f32_x(svptrue_b32(), v296, v726); + svint16_t v733 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v727, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v749 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v728, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t zero868 = svdup_n_f32(0); + svfloat32_t v868 = svcmla_f32_x(pred_full, zero868, v1805, v861, 90); + svfloat32_t v869 = svadd_f32_x(svptrue_b32(), v373, v860); + svfloat32_t v870 = svsub_f32_x(svptrue_b32(), v373, v860); + svfloat32_t v1013 = svsub_f32_x(svptrue_b32(), v298, v1010); + svfloat32_t v1014 = svadd_f32_x(svptrue_b32(), v298, v1010); + svint16_t v1017 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1011, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svint16_t v1033 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1012, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svfloat32_t zero1152 = svdup_n_f32(0); + svfloat32_t v1152 = svcmla_f32_x(pred_full, zero1152, v1805, v1145, 90); + svfloat32_t v1153 = svadd_f32_x(svptrue_b32(), v375, v1144); + svfloat32_t v1154 = svsub_f32_x(svptrue_b32(), v375, v1144); + svst1w_u64(pred_full, (unsigned *)(v1616), svreinterpret_u64_s16(v812)); + svst1w_u64(pred_full, (unsigned *)(v1634), svreinterpret_u64_s16(v828)); + svst1w_u64(pred_full, (unsigned *)(v1698), svreinterpret_u64_s16(v954)); + svst1w_u64(pred_full, (unsigned *)(v1716), svreinterpret_u64_s16(v970)); + svst1w_u64(pred_full, (unsigned *)(v1771), svreinterpret_u64_s16(v1088)); + svst1w_u64(pred_full, (unsigned *)(v1780), svreinterpret_u64_s16(v1096)); + svst1w_u64(pred_full, (unsigned *)(v1789), svreinterpret_u64_s16(v1104)); + svst1w_u64(pred_full, (unsigned *)(v1798), svreinterpret_u64_s16(v1112)); + svint16_t v741 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v729, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v757 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v730, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v871 = svsub_f32_x(svptrue_b32(), v374, v868); + svfloat32_t v872 = svadd_f32_x(svptrue_b32(), v374, v868); + svint16_t v875 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v869, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v891 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v870, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + 
svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v1025 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1013, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svint16_t v1041 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1014, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svfloat32_t v1155 = svsub_f32_x(svptrue_b32(), v376, v1152); + svfloat32_t v1156 = svadd_f32_x(svptrue_b32(), v376, v1152); + svint16_t v1159 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1153, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svint16_t v1175 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1154, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v1566), svreinterpret_u64_s16(v733)); + svst1w_u64(pred_full, (unsigned *)(v1584), svreinterpret_u64_s16(v749)); + svst1w_u64(pred_full, (unsigned *)(v1730), svreinterpret_u64_s16(v1017)); + svst1w_u64(pred_full, (unsigned *)(v1748), svreinterpret_u64_s16(v1033)); + svint16_t v883 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v871, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v899 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v872, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v1167 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1155, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svint16_t v1183 = + svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, + svmul_n_f32_x(pred_full, v1156, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, + 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v1575), svreinterpret_u64_s16(v741)); + svst1w_u64(pred_full, (unsigned *)(v1593), svreinterpret_u64_s16(v757)); + svst1w_u64(pred_full, (unsigned *)(v1648), svreinterpret_u64_s16(v875)); + svst1w_u64(pred_full, (unsigned *)(v1666), svreinterpret_u64_s16(v891)); + svst1w_u64(pred_full, (unsigned *)(v1739), svreinterpret_u64_s16(v1025)); + svst1w_u64(pred_full, (unsigned *)(v1757), svreinterpret_u64_s16(v1041)); + svst1w_u64(pred_full, (unsigned *)(v1812), svreinterpret_u64_s16(v1159)); + svst1w_u64(pred_full, (unsigned *)(v1830), svreinterpret_u64_s16(v1175)); + svst1w_u64(pred_full, (unsigned *)(v1657), svreinterpret_u64_s16(v883)); + svst1w_u64(pred_full, (unsigned *)(v1675), svreinterpret_u64_s16(v899)); + svst1w_u64(pred_full, (unsigned *)(v1821), svreinterpret_u64_s16(v1167)); + svst1w_u64(pred_full, (unsigned *)(v1839), svreinterpret_u64_s16(v1183)); + v5 += v11; + v6 += v12; + } +} +#endif diff --git a/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uu.h b/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uu.h new file mode 100644 index 0000000000000000000000000000000000000000..932992322b738c090771c12c7e346b3a0220b960 --- /dev/null +++ 
b/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uu.h @@ -0,0 +1,47 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause +*/ +#pragma once + +#include "armral.h" +#include "fft_helper.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef void(cs16_cf32_cs16_ac_n_uu_fft_t)(const armral_cmplx_int16_t *x, + armral_cmplx_int16_t *y, int istride, + int ostride, int howmany, float dir); + +cs16_cf32_cs16_ac_n_uu_fft_t armral_fft_cs16_cf32_cs16_ac_n_uu2; +cs16_cf32_cs16_ac_n_uu_fft_t armral_fft_cs16_cf32_cs16_ac_n_uu3; +cs16_cf32_cs16_ac_n_uu_fft_t armral_fft_cs16_cf32_cs16_ac_n_uu4; +cs16_cf32_cs16_ac_n_uu_fft_t armral_fft_cs16_cf32_cs16_ac_n_uu5; +cs16_cf32_cs16_ac_n_uu_fft_t armral_fft_cs16_cf32_cs16_ac_n_uu6; +cs16_cf32_cs16_ac_n_uu_fft_t armral_fft_cs16_cf32_cs16_ac_n_uu7; +cs16_cf32_cs16_ac_n_uu_fft_t armral_fft_cs16_cf32_cs16_ac_n_uu8; +cs16_cf32_cs16_ac_n_uu_fft_t armral_fft_cs16_cf32_cs16_ac_n_uu9; +cs16_cf32_cs16_ac_n_uu_fft_t armral_fft_cs16_cf32_cs16_ac_n_uu10; +cs16_cf32_cs16_ac_n_uu_fft_t armral_fft_cs16_cf32_cs16_ac_n_uu11; +cs16_cf32_cs16_ac_n_uu_fft_t armral_fft_cs16_cf32_cs16_ac_n_uu12; +cs16_cf32_cs16_ac_n_uu_fft_t armral_fft_cs16_cf32_cs16_ac_n_uu13; +cs16_cf32_cs16_ac_n_uu_fft_t armral_fft_cs16_cf32_cs16_ac_n_uu14; +cs16_cf32_cs16_ac_n_uu_fft_t armral_fft_cs16_cf32_cs16_ac_n_uu15; +cs16_cf32_cs16_ac_n_uu_fft_t armral_fft_cs16_cf32_cs16_ac_n_uu16; +cs16_cf32_cs16_ac_n_uu_fft_t armral_fft_cs16_cf32_cs16_ac_n_uu17; +cs16_cf32_cs16_ac_n_uu_fft_t armral_fft_cs16_cf32_cs16_ac_n_uu18; +cs16_cf32_cs16_ac_n_uu_fft_t armral_fft_cs16_cf32_cs16_ac_n_uu19; +cs16_cf32_cs16_ac_n_uu_fft_t armral_fft_cs16_cf32_cs16_ac_n_uu20; +cs16_cf32_cs16_ac_n_uu_fft_t armral_fft_cs16_cf32_cs16_ac_n_uu21; +cs16_cf32_cs16_ac_n_uu_fft_t armral_fft_cs16_cf32_cs16_ac_n_uu22; +cs16_cf32_cs16_ac_n_uu_fft_t armral_fft_cs16_cf32_cs16_ac_n_uu24; +cs16_cf32_cs16_ac_n_uu_fft_t armral_fft_cs16_cf32_cs16_ac_n_uu25; +cs16_cf32_cs16_ac_n_uu_fft_t armral_fft_cs16_cf32_cs16_ac_n_uu32; + +#ifdef __cplusplus +} // extern "C" +#endif \ No newline at end of file diff --git a/src/LowerPHY/FFT/fft_cs16_kernel_lookup.c b/src/LowerPHY/FFT/fft_cs16_kernel_lookup.c index 1ee3d7c937caec92ab066ebcb7ca2f92ffbd5158..2880868d60375e5321cec102890d99d8cd87d663 100644 --- a/src/LowerPHY/FFT/fft_cs16_kernel_lookup.c +++ b/src/LowerPHY/FFT/fft_cs16_kernel_lookup.c @@ -101,6 +101,51 @@ static cs16_cf32_cf32_ac_n_uu_fft_t NULL, }; +static cs16_cf32_cf32_ac_n_gu_fft_t + *base_cs16_cf32_cf32_ac_n_gu_kernels[NUM_FFT_CS16_BASE_KERNELS] = { + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + armral_fft_cs16_cf32_cf32_ac_n_gu7, + NULL, + armral_fft_cs16_cf32_cf32_ac_n_gu9, + NULL, + armral_fft_cs16_cf32_cf32_ac_n_gu11, + NULL, + armral_fft_cs16_cf32_cf32_ac_n_gu13, + armral_fft_cs16_cf32_cf32_ac_n_gu14, + armral_fft_cs16_cf32_cf32_ac_n_gu15, + armral_fft_cs16_cf32_cf32_ac_n_gu16, + armral_fft_cs16_cf32_cf32_ac_n_gu17, + armral_fft_cs16_cf32_cf32_ac_n_gu18, + armral_fft_cs16_cf32_cf32_ac_n_gu19, + armral_fft_cs16_cf32_cf32_ac_n_gu20, + armral_fft_cs16_cf32_cf32_ac_n_gu21, + armral_fft_cs16_cf32_cf32_ac_n_gu22, + NULL, + armral_fft_cs16_cf32_cf32_ac_n_gu24, + armral_fft_cs16_cf32_cf32_ac_n_gu25, + NULL, + NULL, + armral_fft_cs16_cf32_cf32_ac_n_gu28, + NULL, + armral_fft_cs16_cf32_cf32_ac_n_gu30, + NULL, + armral_fft_cs16_cf32_cf32_ac_n_gu32, + NULL, + NULL, + NULL, + armral_fft_cs16_cf32_cf32_ac_n_gu36, + NULL, + NULL, + NULL, + 
armral_fft_cs16_cf32_cf32_ac_n_gu40, +}; + static cf32_cf32_cs16_ab_t_gu_fft_t *base_cf32_cf32_cs16_ab_t_gu_kernels[NUM_FFT_CS16_BASE_KERNELS] = { NULL, @@ -146,6 +191,51 @@ static cf32_cf32_cs16_ab_t_gu_fft_t NULL, }; +static cf32_cf32_cs16_ab_t_gs_fft_t + *base_cf32_cf32_cs16_ab_t_gs_kernels[NUM_FFT_CS16_BASE_KERNELS] = { + NULL, + NULL, + armral_fft_cf32_cf32_cs16_ab_t_gs2, + armral_fft_cf32_cf32_cs16_ab_t_gs3, + armral_fft_cf32_cf32_cs16_ab_t_gs4, + armral_fft_cf32_cf32_cs16_ab_t_gs5, + armral_fft_cf32_cf32_cs16_ab_t_gs6, + armral_fft_cf32_cf32_cs16_ab_t_gs7, + armral_fft_cf32_cf32_cs16_ab_t_gs8, + armral_fft_cf32_cf32_cs16_ab_t_gs9, + armral_fft_cf32_cf32_cs16_ab_t_gs10, + armral_fft_cf32_cf32_cs16_ab_t_gs11, + armral_fft_cf32_cf32_cs16_ab_t_gs12, + armral_fft_cf32_cf32_cs16_ab_t_gs13, + armral_fft_cf32_cf32_cs16_ab_t_gs14, + armral_fft_cf32_cf32_cs16_ab_t_gs15, + armral_fft_cf32_cf32_cs16_ab_t_gs16, + armral_fft_cf32_cf32_cs16_ab_t_gs17, + armral_fft_cf32_cf32_cs16_ab_t_gs18, + armral_fft_cf32_cf32_cs16_ab_t_gs19, + armral_fft_cf32_cf32_cs16_ab_t_gs20, + armral_fft_cf32_cf32_cs16_ab_t_gs21, + armral_fft_cf32_cf32_cs16_ab_t_gs22, + NULL, + armral_fft_cf32_cf32_cs16_ab_t_gs24, + armral_fft_cf32_cf32_cs16_ab_t_gs25, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + armral_fft_cf32_cf32_cs16_ab_t_gs32, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, +}; + static cf32_cf32_cs16_ac_n_uu_fft_t *base_cf32_cf32_cs16_ac_n_uu_kernels[NUM_FFT_CS16_BASE_KERNELS] = { NULL, @@ -191,6 +281,51 @@ static cf32_cf32_cs16_ac_n_uu_fft_t NULL, }; +static cs16_cf32_cs16_ac_n_uu_fft_t + *base_cs16_cf32_cs16_ac_n_uu_kernels[NUM_FFT_CS16_BASE_KERNELS] = { + NULL, + NULL, + armral_fft_cs16_cf32_cs16_ac_n_uu2, + armral_fft_cs16_cf32_cs16_ac_n_uu3, + armral_fft_cs16_cf32_cs16_ac_n_uu4, + armral_fft_cs16_cf32_cs16_ac_n_uu5, + armral_fft_cs16_cf32_cs16_ac_n_uu6, + armral_fft_cs16_cf32_cs16_ac_n_uu7, + armral_fft_cs16_cf32_cs16_ac_n_uu8, + armral_fft_cs16_cf32_cs16_ac_n_uu9, + armral_fft_cs16_cf32_cs16_ac_n_uu10, + armral_fft_cs16_cf32_cs16_ac_n_uu11, + armral_fft_cs16_cf32_cs16_ac_n_uu12, + armral_fft_cs16_cf32_cs16_ac_n_uu13, + armral_fft_cs16_cf32_cs16_ac_n_uu14, + armral_fft_cs16_cf32_cs16_ac_n_uu15, + armral_fft_cs16_cf32_cs16_ac_n_uu16, + armral_fft_cs16_cf32_cs16_ac_n_uu17, + armral_fft_cs16_cf32_cs16_ac_n_uu18, + armral_fft_cs16_cf32_cs16_ac_n_uu19, + armral_fft_cs16_cf32_cs16_ac_n_uu20, + armral_fft_cs16_cf32_cs16_ac_n_uu21, + armral_fft_cs16_cf32_cs16_ac_n_uu22, + NULL, + armral_fft_cs16_cf32_cs16_ac_n_uu24, + armral_fft_cs16_cf32_cs16_ac_n_uu25, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + armral_fft_cs16_cf32_cs16_ac_n_uu32, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, +}; + cs16_cf32_cs16_ac_n_uun_fft_t * lookup_ac_uun_base_kernel_cs16_cs16(int n, armral_fft_direction_t dir) { if (n >= NUM_FFT_CS16_BASE_KERNELS) { @@ -207,6 +342,14 @@ lookup_ac_uu_base_kernel_cs16_cf32(int n, armral_fft_direction_t dir) { return base_cs16_cf32_cf32_ac_n_uu_kernels[n]; } +cs16_cf32_cf32_ac_n_gu_fft_t * +lookup_ac_gu_base_kernel_cs16_cf32(int n, armral_fft_direction_t dir) { + if (n >= NUM_FFT_CS16_BASE_KERNELS) { + return NULL; + } + return base_cs16_cf32_cf32_ac_n_gu_kernels[n]; +} + cf32_cf32_cs16_ab_t_gu_fft_t * lookup_ab_twiddle_gu_base_kernel_cf32_cs16(int n, armral_fft_direction_t dir) { if (n >= NUM_FFT_CS16_BASE_KERNELS) { @@ -215,6 +358,14 @@ lookup_ab_twiddle_gu_base_kernel_cf32_cs16(int n, armral_fft_direction_t dir) { return base_cf32_cf32_cs16_ab_t_gu_kernels[n]; } 
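(Editor's note, not part of the patch.) The kernel tables added in this file all follow one pattern: a fixed-size array of kernel function pointers indexed by transform length, with NULL marking lengths that have no specialised kernel, plus a bounds-checked lookup helper that returns NULL so the planner can fall back to a generic path. Below is a minimal, self-contained sketch of that pattern using hypothetical names (example_kernel_t, lookup_example_kernel); it illustrates the idea only and is not ArmRAL code.

/* Illustrative sketch only -- hypothetical names, not part of ArmRAL. */
#include <stddef.h>
#include <stdio.h>

/* A stand-in kernel signature: input, output and an element stride. */
typedef void(example_kernel_t)(const float *x, float *y, int stride);

static void example_kernel_len4(const float *x, float *y, int stride) {
  /* Placeholder body; a real kernel would compute a length-4 transform. */
  for (int i = 0; i < 4; i++) {
    y[i * stride] = x[i * stride];
  }
}

#define NUM_EXAMPLE_KERNELS 8

/* NULL means "no specialised kernel for this length". */
static example_kernel_t *example_kernels[NUM_EXAMPLE_KERNELS] = {
    NULL, NULL, NULL, NULL, example_kernel_len4, NULL, NULL, NULL,
};

static example_kernel_t *lookup_example_kernel(int n) {
  if (n < 0 || n >= NUM_EXAMPLE_KERNELS) {
    return NULL; /* out of range: caller falls back to a generic path */
  }
  return example_kernels[n];
}

int main(void) {
  float x[4] = {1.0F, 2.0F, 3.0F, 4.0F};
  float y[4] = {0.0F, 0.0F, 0.0F, 0.0F};
  example_kernel_t *k = lookup_example_kernel(4);
  if (k != NULL) {
    k(x, y, 1); /* a specialised kernel exists for n == 4 */
  }
  printf("y[0] = %f\n", (double)y[0]);
  return 0;
}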
+cf32_cf32_cs16_ab_t_gs_fft_t * +lookup_ab_twiddle_gs_base_kernel_cf32_cs16(int n, armral_fft_direction_t dir) { + if (n >= NUM_FFT_CS16_BASE_KERNELS) { + return NULL; + } + return base_cf32_cf32_cs16_ab_t_gs_kernels[n]; +} + cf32_cf32_cs16_ac_n_uu_fft_t * lookup_ac_uu_base_kernel_cf32_cs16(int n, armral_fft_direction_t dir) { if (n >= NUM_FFT_CS16_BASE_KERNELS) { @@ -222,3 +373,11 @@ lookup_ac_uu_base_kernel_cf32_cs16(int n, armral_fft_direction_t dir) { } return base_cf32_cf32_cs16_ac_n_uu_kernels[n]; } + +cs16_cf32_cs16_ac_n_uu_fft_t * +lookup_ac_uu_base_kernel_cs16_cs16(int n, armral_fft_direction_t dir) { + if (n >= NUM_FFT_CS16_BASE_KERNELS) { + return NULL; + } + return base_cs16_cf32_cs16_ac_n_uu_kernels[n]; +} diff --git a/src/LowerPHY/FFT/fft_cs16_kernel_lookup.h b/src/LowerPHY/FFT/fft_cs16_kernel_lookup.h index f85f5e0cc85f0e0222925020da07a872dc720100..fb874cd97ca8122c6605f50ee4b572b0833f4422 100644 --- a/src/LowerPHY/FFT/fft_cs16_kernel_lookup.h +++ b/src/LowerPHY/FFT/fft_cs16_kernel_lookup.h @@ -6,9 +6,12 @@ */ #pragma once +#include "fft_cf32_cf32_cs16_ab_t_gs.h" #include "fft_cf32_cf32_cs16_ab_t_gu.h" #include "fft_cf32_cf32_cs16_ac_n_uu.h" +#include "fft_cs16_cf32_cf32_ac_n_gu.h" #include "fft_cs16_cf32_cf32_ac_n_uu.h" +#include "fft_cs16_cf32_cs16_ac_n_uu.h" #include "fft_cs16_cf32_cs16_ac_n_uun.h" #ifdef __cplusplus @@ -21,12 +24,21 @@ lookup_ac_uun_base_kernel_cs16_cs16(int n, armral_fft_direction_t dir); cs16_cf32_cf32_ac_n_uu_fft_t * lookup_ac_uu_base_kernel_cs16_cf32(int n, armral_fft_direction_t dir); +cs16_cf32_cf32_ac_n_gu_fft_t * +lookup_ac_gu_base_kernel_cs16_cf32(int n, armral_fft_direction_t dir); + cf32_cf32_cs16_ab_t_gu_fft_t * lookup_ab_twiddle_gu_base_kernel_cf32_cs16(int n, armral_fft_direction_t dir); +cf32_cf32_cs16_ab_t_gs_fft_t * +lookup_ab_twiddle_gs_base_kernel_cf32_cs16(int n, armral_fft_direction_t dir); + cf32_cf32_cs16_ac_n_uu_fft_t * lookup_ac_uu_base_kernel_cf32_cs16(int n, armral_fft_direction_t dir); +cs16_cf32_cs16_ac_n_uu_fft_t * +lookup_ac_uu_base_kernel_cs16_cs16(int n, armral_fft_direction_t dir); + #ifdef __cplusplus } // extern "C" #endif \ No newline at end of file diff --git a/src/LowerPHY/FFT/fft_execute.cpp b/src/LowerPHY/FFT/fft_execute.cpp index 6960837a20e3fb553e69b0caab41aad54b71d654..f5d3dbdbd11fabe618b4a9749e6bccee4d717bc3 100644 --- a/src/LowerPHY/FFT/fft_execute.cpp +++ b/src/LowerPHY/FFT/fft_execute.cpp @@ -11,9 +11,9 @@ namespace { template -inline void execute_single_level(const armral_fft_plan_t *p, const Tx *x, Ty *y, - int istride, int ostride, int howmany, - int idist, int odist) { +inline void execute_single_level(const armral::fft::plan_1d_t *p, const Tx *x, + Ty *y, int istride, int idist, int ostride, + int odist, int howmany) { auto *lev = static_cast *>(p->levels[0]); assert(lev->how_many == 1); assert(lev->n2 == 1); @@ -25,7 +25,15 @@ inline void execute_single_level(const armral_fft_plan_t *p, const Tx *x, Ty *y, nullptr, howmany, idist, odist); } else { assert(lev->kernel); - lev->kernel(x, y, istride, ostride, howmany, p->dir); + // Make use of uu kernels when the distance between transform input/output + // domains is 1 + if (idist == 1 && odist == 1) { + lev->kernel(x, y, istride, ostride, howmany, p->dir); + } else { + for (int i = 0; i < howmany; i++) { + lev->kernel(x + i * idist, y + i * odist, istride, ostride, 1, p->dir); + } + } } } @@ -46,22 +54,14 @@ inline void execute_dit(const armral::fft::lev_base_t *lev, const Tx *x, Ty *y, level->kernel(x, y, n1_istride, n1_ostride, level->n2 * 
level->how_many, lev->dir); } else { - if constexpr (std::is_same_v && std::is_same_v) { - assert(level->ac_gu_kernel); - // TODO_KB: I think that this is right, in terms of the idist = - // n2_istride. May need to tweak this a little - level->ac_gu_kernel(x, y, n1_istride, n1_ostride, - level->n2 * level->how_many, istride, lev->dir); - } else { - // We should not be able to get into this branch, as we expect input, - // output and working type to be the same - assert(false); - } + assert(level->ac_gu_kernel); + level->ac_gu_kernel(x, y, n1_istride, n1_ostride, + level->n2 * level->how_many, istride, lev->dir); } } else { // Rader's or Bluestein's for (int hm = 0; hm != level->how_many; ++hm) { - const Tx *x_ptr = &x[hm]; + const Tx *x_ptr = &x[hm * istride]; Ty *y_ptr = &y[hm]; if (level->r) { armral::fft::execute_rader( @@ -106,7 +106,7 @@ inline void execute_dit_ab_twid(const armral::fft::lev_base_t *lev, const Tx *x, } else { for (int hm = 0; hm < level->how_many; ++hm) { const Tx *x_ptr = &x[hm]; - Ty *y_ptr = &y[hm]; + Ty *y_ptr = &y[hm * ostride]; if (level->r) { armral::fft::execute_rader( level->r, x_ptr, y_ptr, n1_istride, n1_ostride, level->twids, @@ -169,8 +169,8 @@ inline void execute_dit_ac_twid(const armral::fft::lev_base_t *lev, const Tx *x, namespace armral::fft { template -armral_status execute(const armral_fft_plan_t *p, const Tx *x, Ty *y, - int istride, int ostride, int howmany) { +armral_status execute_1d(const plan_1d_t *p, const Tx *x, Ty *y, int istride, + int idist, int ostride, int odist, int howmany) { static_assert(sizeof(Tw) >= sizeof(Tx) && sizeof(Tw) >= sizeof(Ty)); if (p == nullptr) { assert(false && "Plan is invalid"); @@ -182,7 +182,8 @@ armral_status execute(const armral_fft_plan_t *p, const Tx *x, Ty *y, // shortcut if we only have one level, no need to use temporary buffers etc. // We assume that howmany are unit stride apart in both the input and output if (num_levels == 1) { - execute_single_level(p, x, y, istride, ostride, howmany, 1, 1); + execute_single_level(p, x, y, istride, idist, ostride, odist, + howmany); return ARMRAL_SUCCESS; } @@ -192,7 +193,8 @@ armral_status execute(const armral_fft_plan_t *p, const Tx *x, Ty *y, // actually do the computation! for (int h = 0; h < howmany; ++h) { - execute_dit(levs[0], &x[h], tmp1, istride, 1); // x -> tmp + execute_dit(levs[0], &x[h * idist], tmp1, istride, + 1); // x -> tmp for (int i = 1; i < num_levels - 1; ++i) { Tw *t1 = i % 2 == 0 ? tmp2 : tmp1; Tw *t2 = i % 2 == 0 ? tmp1 : tmp2; @@ -201,20 +203,53 @@ armral_status execute(const armral_fft_plan_t *p, const Tx *x, Ty *y, } auto *t1 = num_levels % 2 == 0 ? tmp1 : tmp2; // tmp -> y - execute_dit_ab_twid(levs[num_levels - 1], t1, &y[h], 1, + execute_dit_ab_twid(levs[num_levels - 1], t1, &y[h * odist], 1, ostride); } return ARMRAL_SUCCESS; } +template armral_status +execute_1d( + const plan_1d_t *p, const armral_cmplx_f32_t *x, armral_cmplx_f32_t *y, + int istride, int idist, int ostride, int odist, int howmany); + +template armral_status +execute_1d( + const plan_1d_t *p, const armral_cmplx_int16_t *x, armral_cmplx_int16_t *y, + int istride, int idist, int ostride, int odist, int howmany); + +template +armral_status execute(const armral_fft_plan_t *p, const Tx *x, Ty *y) { + + if (p->ndims == 1) { + return execute_1d(p->plans[0], x, y, 1, 0, 1, 0, 1); + } + + if (p->ndims == 2) { + auto n0 = p->plans[0]->n; + auto n1 = p->plans[1]->n; + // n0 transforms of length n1. 
2-d FFTs are row-major (dim of length n1) + // so these are transforms in the contiguous dimension. + auto stat = execute_1d(p->plans[1], x, y, 1, n1, 1, n1, n0); + if (stat != ARMRAL_SUCCESS) { + return stat; + } + // n1 transforms of length n0. This is the strided dimension. + return execute_1d(p->plans[0], y, y, n1, 1, n1, 1, n1); + } + + return ARMRAL_ARGUMENT_ERROR; +} + template armral_status execute( const armral_fft_plan_t *p, const armral_cmplx_f32_t *x, - armral_cmplx_f32_t *y, int istride, int ostride, int howmany); + armral_cmplx_f32_t *y); template armral_status execute( const armral_fft_plan_t *p, const armral_cmplx_int16_t *x, - armral_cmplx_int16_t *y, int istride, int ostride, int howmany); + armral_cmplx_int16_t *y); } // namespace armral::fft diff --git a/src/LowerPHY/FFT/fft_execute.hpp b/src/LowerPHY/FFT/fft_execute.hpp index 6f410b8c9e9829e97fc7a5a9e7ac1ca57118f6b4..8b735e19246566a7f3a84f9ff6bcfc6d131260e6 100644 --- a/src/LowerPHY/FFT/fft_execute.hpp +++ b/src/LowerPHY/FFT/fft_execute.hpp @@ -7,11 +7,15 @@ #pragma once #include "armral.h" +#include "fft_plan.hpp" namespace armral::fft { template -armral_status execute(const armral_fft_plan_t *p, const Tx *x, Ty *y, - int istride, int ostride, int howmany); +armral_status execute_1d(const plan_1d_t *p, const Tx *x, Ty *y, int istride, + int idist, int ostride, int odist, int howmany); + +template +armral_status execute(const armral_fft_plan_t *p, const Tx *x, Ty *y); } // namespace armral::fft diff --git a/src/LowerPHY/FFT/fft_level.hpp b/src/LowerPHY/FFT/fft_level.hpp index 0015278233f4471a4d27e3b9f6841895be340545..d3dfec7c7f767a46a6f198fbf3476fae66c5f37e 100644 --- a/src/LowerPHY/FFT/fft_level.hpp +++ b/src/LowerPHY/FFT/fft_level.hpp @@ -58,7 +58,7 @@ struct lev_t : public lev_base_t { // we perform a Rader's, and decompose into a 2D problem. 
At this point we are // already in the working type, so we have input, output and working type all // the same - fft_ac_gu_func_t ac_gu_kernel; + fft_ac_gu_func_t ac_gu_kernel; fft_ab_twid_gu_func_t ab_twid_gu_kernel; fft_ab_twid_gs_func_t ab_twid_gs_kernel; fft_ac_twid_func_t ac_twid_kernel; @@ -70,7 +70,7 @@ struct lev_t : public lev_base_t { lev_t(int n_in, int n1_in, int n2_in, int how_many_in, armral_fft_direction_t dir_in, Tw *twids_in, fft_ac_uu_func_t kernel_in, - fft_ac_gu_func_t ac_gu_kernel_in, + fft_ac_gu_func_t ac_gu_kernel_in, fft_ab_twid_gu_func_t ab_twid_gu_kernel_in, fft_ab_twid_gs_func_t ab_twid_gs_kernel_in, fft_ac_twid_func_t ac_twid_kernel_in, diff --git a/src/LowerPHY/FFT/fft_plan.cpp b/src/LowerPHY/FFT/fft_plan.cpp index 96f8f1c7a8547352e9d08c5936b579e5a80fa387..9466ec461e4491b07a151bb5f0b751b1cb37ba55 100644 --- a/src/LowerPHY/FFT/fft_plan.cpp +++ b/src/LowerPHY/FFT/fft_plan.cpp @@ -101,8 +101,10 @@ inline armral::fft::fft_ac_uu_func_t get_base_kernel( int n, armral_fft_direction_t dir, bool want_uun) { - assert(want_uun); - return lookup_ac_uun_base_kernel_cs16_cs16(n, dir); + if (want_uun) { + return lookup_ac_uun_base_kernel_cs16_cs16(n, dir); + } + return lookup_ac_uu_base_kernel_cs16_cs16(n, dir); } template<> @@ -123,7 +125,9 @@ get_base_kernel( template inline armral::fft::fft_ac_gu_func_t -get_ac_gu_base_kernel(int n, armral_fft_direction_t dir); +get_ac_gu_base_kernel(int n, armral_fft_direction_t dir) { + return nullptr; +} template<> inline armral::fft::fft_ac_gu_func_t +inline armral::fft::fft_ac_gu_func_t +get_ac_gu_base_kernel(int n, armral_fft_direction_t dir) { + return lookup_ac_gu_base_kernel_cs16_cf32(n, dir); +} + template inline armral::fft::fft_ab_twid_gu_func_t get_ab_twiddle_gu_base_kernel(int n, armral_fft_direction_t dir) { @@ -172,6 +184,15 @@ get_ab_twiddle_gs_base_kernel +inline armral::fft::fft_ab_twid_gs_func_t< + armral_cmplx_f32_t, armral_cmplx_int16_t, armral_cmplx_f32_t> +get_ab_twiddle_gs_base_kernel(int n, + armral_fft_direction_t dir) { + return lookup_ab_twiddle_gs_base_kernel_cf32_cs16(n, dir); +} + template inline armral::fft::fft_ac_twid_func_t get_ac_twiddle_base_kernel(int n, armral_fft_direction_t dir) { @@ -190,7 +211,7 @@ get_ac_twiddle_base_kernel struct kernel_selection { armral::fft::fft_ac_uu_func_t base_kernel; - armral::fft::fft_ac_gu_func_t ac_gu_kernel; + armral::fft::fft_ac_gu_func_t ac_gu_kernel; armral::fft::fft_ab_twid_gu_func_t ab_twid_gu_kernel; armral::fft::fft_ab_twid_gs_func_t ab_twid_gs_kernel; armral::fft::fft_ac_twid_func_t ac_twid_kernel; @@ -208,9 +229,7 @@ template kernel_selection get_kernels(int n1, armral_fft_direction_t dir, bool want_twids, bool want_uun) { auto kernel = get_base_kernel(n1, dir, want_uun); - auto ac_gu_kernel = (std::is_same_v && std::is_same_v) - ? get_ac_gu_base_kernel(n1, dir) - : nullptr; + auto ac_gu_kernel = get_ac_gu_base_kernel(n1, dir); auto ab_twid_gu_kernel = want_twids ? 
get_ab_twiddle_gu_base_kernel(n1, dir) : nullptr; auto ab_twid_gs_kernel = @@ -305,7 +324,7 @@ template armral::fft::lev_base_t * make_level_data(int n, int n1, int n2, int how_many, armral_fft_direction_t dir, bool want_twiddles, bool want_ac, bool allow_raders, - bool use_all_kernels, bool want_uun) { + bool use_all_kernels, bool want_uun, bool is2d) { using level_type = armral::fft::lev_t; if (kernel_exists(n1, use_all_kernels)) { auto [kernel, ac_gu_kernel, ab_twid_gu_kernel, ab_twid_gs_kernel, @@ -327,7 +346,8 @@ make_level_data(int n, int n1, int n2, int how_many, armral_fft_direction_t dir, return nullptr; } Tw *twids = want_twiddles ? make_twiddles(n1, n2, dir, 1, true) : nullptr; - auto maybe_r = armral::fft::make_rader(n1, dir, n); + auto maybe_r = + armral::fft::make_rader(n1, dir, n, want_uun && !is2d); if (maybe_r) { auto r = std::move(*maybe_r); if (r.n == 0) { @@ -352,7 +372,7 @@ make_level_data(int n, int n1, int n2, int how_many, armral_fft_direction_t dir, template int factorize(int n, armral_fft_direction_t dir, int max_levels, armral::fft::lev_base_t **levels, bool allow_raders, - bool use_all_kernels, bool want_uun) { + bool use_all_kernels, bool want_uun, bool is2d) { // Search through the set of supported factors to find a suitable // factorization, then use that to build the level data structures. int factors[max_levels]; @@ -374,31 +394,36 @@ int factorize(int n, armral_fft_direction_t dir, int max_levels, if (num_factors == 1) { // Operating on a single level - input output and working types are as // specified for this function - levels[fi] = make_level_data(n, n1, n2, how_many, dir, - false, false, allow_raders, - use_all_kernels, want_uun); + levels[fi] = make_level_data( + n, n1, n2, how_many, dir, false, false, allow_raders, + use_all_kernels, want_uun, is2d); } else { // We have multiple levels, and are currently dealing with the first // level. Transform data to the working type from the input type - levels[fi] = - make_level_data(n, n1, n2, how_many, dir, false, false, - allow_raders, use_all_kernels, false); + levels[fi] = make_level_data(n, n1, n2, how_many, dir, + false, false, allow_raders, + use_all_kernels, false, is2d); } } else if (fi == num_factors - 1) { // We have multiple levels and are currently dealing with the last level. // Transform data from the working type to the output type - levels[fi] = - make_level_data(n, n1, n2, how_many, dir, true, false, - allow_raders, use_all_kernels, false); + levels[fi] = make_level_data(n, n1, n2, how_many, dir, true, + false, allow_raders, + use_all_kernels, false, is2d); } else { // We have multiple levels and are currently dealing with an intermediate // level (i.e. not first or last). 
All work is done in the working type - levels[fi] = - make_level_data(n, n1, n2, how_many, dir, true, true, - allow_raders, use_all_kernels, false); + levels[fi] = make_level_data(n, n1, n2, how_many, dir, true, + true, allow_raders, + use_all_kernels, false, is2d); } if (!levels[fi]) { + // Delete all previous levels and abort - this is required for cleanly + // aborting Raders plans and falling back to Bluestein + for (int j = 0; j < fi; j++) { + delete levels[j]; + } return 0; } } @@ -410,42 +435,147 @@ int factorize(int n, armral_fft_direction_t dir, int max_levels, namespace armral::fft { template -armral_status create_plan(armral_fft_plan_t **p, int n, - armral_fft_direction_t dir, bool allow_raders, - bool use_all_kernels, bool want_uun) { +armral_status create_plan_1d_internal(plan_1d_t **p, int n, + armral_fft_direction_t dir, + bool allow_raders, bool use_all_kernels, + bool want_uun, bool is2d) { if (n > 42012) { // This length is currently unsupported due to the limit on the number of - // allowed factors/levels, which is defined by armral_fft_plan_t::max_levels + // allowed factors/levels, which is defined by + // plan_1d_t::max_levels + return ARMRAL_ARGUMENT_ERROR; + } + + if (!p) { return ARMRAL_ARGUMENT_ERROR; } - assert(p); + // Try and find a suitable decomposition, else give up. - armral_fft_plan_t tmp_plan = {}; - tmp_plan.n = n; - tmp_plan.dir = dir; - tmp_plan.num_levels = factorize( - n, dir, armral_fft_plan_t::max_levels, tmp_plan.levels, allow_raders, - use_all_kernels, want_uun); - if (tmp_plan.num_levels == 0) { + plan_1d_t tmp_1d_plan = {}; + tmp_1d_plan.n = n; + tmp_1d_plan.dir = dir; + tmp_1d_plan.num_levels = + factorize(n, dir, plan_1d_t::max_levels, tmp_1d_plan.levels, + allow_raders, use_all_kernels, want_uun, is2d); + if (tmp_1d_plan.num_levels == 0) { + return ARMRAL_ARGUMENT_ERROR; + } + + // Only allocate once we're sure we actually have a plan to return. + *p = static_cast(malloc(sizeof(plan_1d_t))); + memcpy(*p, &tmp_1d_plan, sizeof(plan_1d_t)); + + return ARMRAL_SUCCESS; +} + +template armral_status +create_plan_1d_internal(plan_1d_t **p, int n, + armral_fft_direction_t dir, + bool allow_raders, + bool use_all_kernels, bool want_uun, + bool is2d); +template armral_status +create_plan_1d_internal(plan_1d_t **p, int n, + armral_fft_direction_t dir, + bool allow_raders, + bool use_all_kernels, bool want_uun, + bool is2d); + +template +armral_status create_plan_1d(armral_fft_plan_t **p, int n, + armral_fft_direction_t dir, bool allow_raders, + bool use_all_kernels, bool want_uun) { + if (!p) { return ARMRAL_ARGUMENT_ERROR; } + // Create the 1-d plan + plan_1d_t *p0; + auto stat = create_plan_1d_internal(&p0, n, dir, allow_raders, + use_all_kernels, want_uun); + if (stat != ARMRAL_SUCCESS) { + return ARMRAL_ARGUMENT_ERROR; + } + + // Create the overall plan to return + armral_fft_plan_t tmp_plan = {}; + tmp_plan.ndims = 1; + tmp_plan.plans[0] = p0; + tmp_plan.plans[1] = NULL; + // Only allocate once we're sure we actually have a plan to return. 
*p = static_cast(malloc(sizeof(armral_fft_plan_t))); memcpy(*p, &tmp_plan, sizeof(armral_fft_plan_t)); + return ARMRAL_SUCCESS; } template armral_status -create_plan( +create_plan_1d( armral_fft_plan_t **p, int n, armral_fft_direction_t dir, bool allow_raders, bool use_all_kernels, bool want_uun); template armral_status -create_plan( +create_plan_1d( armral_fft_plan_t **p, int n, armral_fft_direction_t dir, bool allow_raders, bool use_all_kernels, bool want_uun); -armral_status destroy_plan(armral_fft_plan_t **p) { +template +armral_status create_plan_2d(armral_fft_plan_t **p, int n0, int n1, + armral_fft_direction_t dir, bool allow_raders, + bool use_all_kernels, bool want_uun) { + if (!p) { + return ARMRAL_ARGUMENT_ERROR; + } + + // Create as 1-d plan if one of the dims is 1 + auto min_n = n0 > n1 ? n1 : n0; + auto max_n = n0 > n1 ? n0 : n1; + if (min_n == 1) { + return create_plan_1d(p, max_n, dir, allow_raders, + use_all_kernels, want_uun); + } + + // Create 2 1-d plans + + plan_1d_t *p0; + auto stat = create_plan_1d_internal(&p0, n0, dir, allow_raders, + use_all_kernels, false, true); + if (stat != ARMRAL_SUCCESS) { + return ARMRAL_ARGUMENT_ERROR; + } + + plan_1d_t *p1; + stat = create_plan_1d_internal(&p1, n1, dir, allow_raders, + use_all_kernels, want_uun, true); + if (stat != ARMRAL_SUCCESS) { + return ARMRAL_ARGUMENT_ERROR; + } + + // Create the overall plan to return + armral_fft_plan_t tmp_plan = {}; + tmp_plan.ndims = 2; + tmp_plan.plans[0] = p0; + tmp_plan.plans[1] = p1; + + // Only allocate once we're sure we actually have a plan to return. + *p = static_cast(malloc(sizeof(armral_fft_plan_t))); + memcpy(*p, &tmp_plan, sizeof(armral_fft_plan_t)); + + return ARMRAL_SUCCESS; +} + +template armral_status +create_plan_2d( + armral_fft_plan_t **p, int n0, int n1, armral_fft_direction_t dir, + bool allow_raders, bool use_all_kernels, bool want_uun); +template armral_status +create_plan_2d( + armral_fft_plan_t **p, int n0, int n1, armral_fft_direction_t dir, + bool allow_raders, bool use_all_kernels, bool want_uun); + +armral_status destroy_plan(plan_1d_t **p) { if (p == nullptr || *p == nullptr) { assert(false && "Invalid plan"); return ARMRAL_ARGUMENT_ERROR; @@ -461,4 +591,21 @@ armral_status destroy_plan(armral_fft_plan_t **p) { return ARMRAL_SUCCESS; } +armral_status destroy_plan(armral_fft_plan_t **p) { + if (p == nullptr || *p == nullptr) { + assert(false && "Invalid plan"); + return ARMRAL_ARGUMENT_ERROR; + } + for (int i = 0; i < (*p)->ndims; ++i) { + assert((*p)->plans[i]); + auto stat = destroy_plan(&((*p)->plans[i])); + if (stat != ARMRAL_SUCCESS) { + return ARMRAL_ARGUMENT_ERROR; + } + } + free(*p); + *p = NULL; + return ARMRAL_SUCCESS; +} + } // namespace armral::fft diff --git a/src/LowerPHY/FFT/fft_plan.hpp b/src/LowerPHY/FFT/fft_plan.hpp index 74cca316820b39b5ceb7a2221058555428fe6d66..496dfb524878035dc68ccc01338d209199b4abbc 100644 --- a/src/LowerPHY/FFT/fft_plan.hpp +++ b/src/LowerPHY/FFT/fft_plan.hpp @@ -7,28 +7,82 @@ #pragma once #include "armral.h" + +// This internal plan type is referenced in headers included from fft_level.hpp +namespace armral::fft { +struct plan_1d_t; +}; + #include "fft_level.hpp" namespace armral::fft { /** - * Creates a plan for solving FFTs. Depending on the data type, the + * Creates a plan for solving rank 1 FFTs. Depending on the data type, the + * plan will execute different functions. 
+ * @tparam Tx Input data type + * @tparam Ty Output data type + * @tparam Tw Working data type + * @param [out] p Pointer to populate with the created FFT plan. + * @param [in] n The overall size of the FFT to perform. + * @param [in] dir The direction of the FFT (forwards or + * backwards). + * @param [in] allow_raders Allow use of Rader's algorithm. + * @param [in] use_all_kernels Allow use of all available kernels. Default is + * false. + * @returns ARMRAL_SUCCESS if a plan is successfully created. + */ +template +armral_status create_plan_1d(armral_fft_plan_t **p, int n, + armral_fft_direction_t dir, bool allow_raders, + bool use_all_kernels = false, + bool want_uun = true); + +/** + * Creates a plan for solving rank 2 FFTs. Depending on the data type, the * plan will execute different functions. * @tparam Tx Input data type * @tparam Ty Output data type * @tparam Tw Working data type * @param [out] p Pointer to populate with the created FFT plan. + * @param [in] n0 The size of the 0th dimension to be solved by + * this FFT plan. + * @param [in] n1 The size of the 1st dimension to be solved by + * this FFT plan. + * @param [in] dir The direction of the FFT (forwards or + * backwards). + * @param [in] allow_raders Allow use of Rader's algorithm. + * @param [in] use_all_kernels Allow use of all available kernels. Default is + * false. + * @returns ARMRAL_SUCCESS if a plan is successfully created. + */ +template +armral_status create_plan_2d(armral_fft_plan_t **p, int n0, int n1, + armral_fft_direction_t dir, bool allow_raders, + bool use_all_kernels = false, + bool want_uun = true); + +/** + * Creates an internal plan for solving rank 1 FFTs. Depending on the data type, + * the plan will execute different functions. + * @tparam Tx Input data type + * @tparam Ty Output data type + * @tparam Tw Working data type + * @param [out] p Pointer to populate with the created FFT plan. * @param [in] n The overall size of the FFT to perform. * @param [in] dir The direction of the FFT (forwards or * backwards). * @param [in] allow_raders Allow use of Rader's algorithm. * @param [in] use_all_kernels Allow use of all available kernels. Default is * false. + * @param [in] is2d Indicates whether the plan is part of an overall + * 2-d plan. Default is false. * @returns ARMRAL_SUCCESS if a plan is successfully created. */ template -armral_status create_plan(armral_fft_plan_t **p, int n, - armral_fft_direction_t dir, bool allow_raders, - bool use_all_kernels = false, bool want_uun = true); +armral_status +create_plan_1d_internal(plan_1d_t **p, int n, armral_fft_direction_t dir, + bool allow_raders, bool use_all_kernels = false, + bool want_uun = true, bool is2d = false); /** * Common code for destroying a plan. For the time being, the plan is identical @@ -39,20 +93,21 @@ armral_status create_plan(armral_fft_plan_t **p, int n, */ armral_status destroy_plan(armral_fft_plan_t **p); +armral_status destroy_plan(plan_1d_t **p); + // Forward declaration of lev_base_t because of circular dependence // plan has levels // levels has rader // rader has plans struct lev_base_t; -} // namespace armral::fft /** - * Structure encapsulating a series of steps to solve an FFT of a + * Structure encapsulating a series of steps to solve a 1-d FFT of a * particular length. This must be built ahead of execution since the * process of building the plan is potentially time consuming (in * particular, constructing twiddle factors).
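(Editor's note, not part of the patch.) To show how the declarations above fit together, the following is a hedged sketch of creating, executing and destroying a rank-2 plan through the internal armral::fft entry points. It assumes the armral_cmplx_f32_t type and ARMRAL_FFT_FORWARDS direction from armral.h, and relies on the explicit armral_cmplx_f32_t instantiations added in fft_plan.cpp and fft_execute.cpp; a real application would use the public ArmRAL FFT interface rather than these internal templates, so treat this as an illustration, not a definitive usage contract.

// Illustrative sketch only -- not part of the patch.
#include "armral.h"
#include "fft_execute.hpp"
#include "fft_plan.hpp"

// Plan a 4x8 forward 2-D FFT on complex floats, run it once, then free the
// plan. Both buffers must hold n0 * n1 = 32 complex elements, laid out
// row-major as the 2-D execute path expects.
armral_status run_2d_fft_sketch(const armral_cmplx_f32_t *in,
                                armral_cmplx_f32_t *out) {
  armral_fft_plan_t *plan = nullptr;
  armral_status stat =
      armral::fft::create_plan_2d<armral_cmplx_f32_t, armral_cmplx_f32_t,
                                  armral_cmplx_f32_t>(
          &plan, /*n0=*/4, /*n1=*/8,
          armral_fft_direction_t::ARMRAL_FFT_FORWARDS,
          /*allow_raders=*/true);
  if (stat != ARMRAL_SUCCESS) {
    return stat;
  }
  // Internally this runs n0 contiguous transforms of length n1, then n1
  // strided transforms of length n0, as in the 2-D branch of execute().
  stat = armral::fft::execute<armral_cmplx_f32_t, armral_cmplx_f32_t,
                              armral_cmplx_f32_t>(plan, in, out);
  armral::fft::destroy_plan(&plan);
  return stat;
}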
*/ -struct armral_fft_plan_t { +struct plan_1d_t { static constexpr int max_levels = 5; /// The problem size being solved. int n; @@ -61,5 +116,17 @@ struct armral_fft_plan_t { /// The number of composite factors involved in the solve. int num_levels; /// Information required to solve the FFT at each level - armral::fft::lev_base_t *levels[max_levels]; + lev_base_t *levels[max_levels]; +}; + +} // namespace armral::fft + +/** + * Structure encapsulating an array of 1-d plans; the type + * returned to the user as an opaque pointer. + */ +struct armral_fft_plan_t { + static constexpr int max_dims = 2; + int ndims; + armral::fft::plan_1d_t *plans[max_dims]; +}; diff --git a/src/LowerPHY/FFT/rader.cpp b/src/LowerPHY/FFT/rader.cpp index a91099bb3f573e39758a610023b745904481358b..391fa7cd2d6fa72ad2b549de44ad00845cb1e171 100644 --- a/src/LowerPHY/FFT/rader.cpp +++ b/src/LowerPHY/FFT/rader.cpp @@ -19,7 +19,7 @@ namespace armral::fft { template std::optional> make_rader(int n, armral_fft_direction_t dir, - int n_whole) { + int n_whole, bool want_uun) { using real_t = armral::fft::real_t; auto g = find_group_generator(n); @@ -30,19 +30,19 @@ std::optional> make_rader(int n, armral_fft_direction_t dir, // try to plan recursive calls, but do not allow recursive use of Rader's // algorithm since that tends to be slower than just using Bluestein. - armral_fft_plan_t *pf = nullptr; - armral_fft_plan_t *pb = nullptr; + plan_1d_t *pf = nullptr; + plan_1d_t *pb = nullptr; // We get a performance benefit from using additional kernels provided the n // we are creating a Rader's plan for isn't the only factor of n_whole. bool use_all_kernels = n_whole > n; // Only allow uun kernels to be used if we know the plans will be executed // with howmany = 1. This will be the case if the level that the Rader's plan // is being created for has n2 = 1, i.e. if n = n_whole. - bool want_uun = n == n_whole; - armral::fft::create_plan( + want_uun = want_uun && n == n_whole; + armral::fft::create_plan_1d_internal( &pf, n - 1, armral_fft_direction_t::ARMRAL_FFT_FORWARDS, false, use_all_kernels, want_uun); - armral::fft::create_plan( + armral::fft::create_plan_1d_internal( &pb, n - 1, armral_fft_direction_t::ARMRAL_FFT_BACKWARDS, false, use_all_kernels, want_uun); if (!pf || !pb) { @@ -78,7 +78,7 @@ std::optional> make_rader(int n, armral_fft_direction_t dir, double in = ((2.
* M_PI * x) / n) * dir_float; b[i] = Tw{(real_t)cos(in), (real_t)sin(in)}; } - armral::fft::execute(pf, b, b, 1, 1, 1); + armral::fft::execute_1d(pf, b, b, 1, 0, 1, 0, 1); // Multiply output from FFT of b with 1/n_pad real_t n1 = 1.0 / (n - 1); @@ -101,16 +101,16 @@ std::optional> make_rader(int n, armral_fft_direction_t dir, template std::optional< rader> -make_rader(int n, armral_fft_direction_t dir, int n_whole); +make_rader(int n, armral_fft_direction_t dir, int n_whole, bool want_uun); template std::optional< rader> -make_rader(int n, armral_fft_direction_t dir, int n_whole); +make_rader(int n, armral_fft_direction_t dir, int n_whole, bool want_uun); template std::optional< rader> -make_rader(int n, armral_fft_direction_t dir, int n_whole); +make_rader(int n, armral_fft_direction_t dir, int n_whole, bool want_uun); template std::optional< rader> -make_rader(int n, armral_fft_direction_t dir, int n_whole); +make_rader(int n, armral_fft_direction_t dir, int n_whole, bool want_uun); template struct rader; @@ -197,8 +197,8 @@ void execute_rader(const rader &r, const Tx *x, Ty *y, int istride, #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #endif - armral::fft::execute(r.pf, work_ptr, work_ptr, howmany, howmany, - howmany); + armral::fft::execute_1d(r.pf, work_ptr, work_ptr, howmany, 1, + howmany, 1, howmany); #ifndef __clang__ #pragma GCC diagnostic pop #endif @@ -222,8 +222,8 @@ void execute_rader(const rader &r, const Tx *x, Ty *y, int istride, #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #endif - armral::fft::execute(r.pb, work_ptr, work_ptr, howmany, howmany, - howmany); + armral::fft::execute_1d(r.pb, work_ptr, work_ptr, howmany, 1, + howmany, 1, howmany); #ifndef __clang__ #pragma GCC diagnostic pop #endif diff --git a/src/LowerPHY/FFT/rader.hpp b/src/LowerPHY/FFT/rader.hpp index c26f124eeb5d79582d7444552513dac814ac5064..ae3bd449598c58247eebf22d8c2a451ce9ac7793 100644 --- a/src/LowerPHY/FFT/rader.hpp +++ b/src/LowerPHY/FFT/rader.hpp @@ -31,9 +31,9 @@ struct rader { int g_inv; /// Plan for the forward part of a sub-fft - armral_fft_plan_t *pf; + plan_1d_t *pf; /// Plan for the backward part of a sub-fft - armral_fft_plan_t *pb; + plan_1d_t *pb; const Tw *b; @@ -70,10 +70,9 @@ struct rader { rader &operator=(const rader &) = delete; rader &operator=(rader &&) = delete; - rader(int n_in, int g_in, int g_inv_in, armral_fft_plan_t *pf_in, - armral_fft_plan_t *pb_in, const Tw *b_in, const int *gmul_fw_in, - const int *gmul_bw_in, const int *ginvmul_fw_in, - const int *ginvmul_bw_in) + rader(int n_in, int g_in, int g_inv_in, plan_1d_t *pf_in, plan_1d_t *pb_in, + const Tw *b_in, const int *gmul_fw_in, const int *gmul_bw_in, + const int *ginvmul_fw_in, const int *ginvmul_bw_in) : n(n_in), g(g_in), g_inv(g_inv_in), pf(pf_in), pb(pb_in), b(b_in), gmul_fw_perm(gmul_fw_in), gmul_bw_perm(gmul_bw_in), ginvmul_fw_perm(ginvmul_fw_in), ginvmul_bw_perm(ginvmul_bw_in) {} @@ -87,7 +86,7 @@ struct rader { template std::optional> make_rader(int n, armral_fft_direction_t dir, - int n_whole); + int n_whole, bool want_uun); template void execute_rader(const rader &r, const Tx *x, Ty *y, int istride, diff --git a/src/UpperPHY/LDPC/arm_ldpc_decoder.cpp b/src/UpperPHY/LDPC/arm_ldpc_decoder.cpp index a7e1d7ea1a58569bc41e541cafb6770037bde826..eabb719a4f620ef386dbbb9a7ec883f9b6f56573 100644 --- a/src/UpperPHY/LDPC/arm_ldpc_decoder.cpp +++ b/src/UpperPHY/LDPC/arm_ldpc_decoder.cpp @@ -1637,11 +1637,66 @@ inline void load_ptr_l(int16_t *ptr_l, 
const int8_t *llrs_ptr, uint32_t len_in, } } +enum class crc_options { CRC16, CRC24A, CRC24B, NONE }; + +std::optional parse_crc_option(uint32_t options) { + uint32_t mask = ARMRAL_LDPC_CRC_NO | ARMRAL_LDPC_CRC_16 | + ARMRAL_LDPC_CRC_24A | ARMRAL_LDPC_CRC_24B; + uint32_t option = options & mask; + + switch (option) { + case ARMRAL_LDPC_CRC_16: + return std::nullopt; // Not supported + case ARMRAL_LDPC_CRC_24A: + return std::nullopt; // Not supported + case ARMRAL_LDPC_CRC_24B: + return crc_options::CRC24B; + case ARMRAL_LDPC_CRC_NO: + return crc_options::NONE; + case 0: // Nothing set + return crc_options::NONE; + } + return std::nullopt; +} + +enum class iter_options { EVERY, END, NEVER }; + +std::optional parse_iter_option(uint32_t options) { + uint32_t mask = ARMRAL_LDPC_CRC_EVERY_ITER | ARMRAL_LDPC_CRC_END_ITER; + uint32_t option = options & mask; + + switch (option) { + case ARMRAL_LDPC_CRC_EVERY_ITER: + return iter_options::EVERY; + case ARMRAL_LDPC_CRC_END_ITER: + return iter_options::END; + case 0: // Nothing set + return iter_options::EVERY; + } + return std::nullopt; +} + +std::optional parse_implicit_bits_option(uint32_t options) { + uint32_t mask = + ARMRAL_LDPC_FILLER_BITS_IMPLICIT | ARMRAL_LDPC_FILLER_BITS_EXPLICIT; + uint32_t option = options & mask; + + switch (option) { + case ARMRAL_LDPC_FILLER_BITS_IMPLICIT: + return true; + case ARMRAL_LDPC_FILLER_BITS_EXPLICIT: + return false; + case 0: // User default + return false; + } + return std::nullopt; +} + template -bool decode_block(uint32_t n, const int8_t *llrs, armral_ldpc_graph_t bg, - uint32_t z, uint32_t len_filler_bits, uint8_t *data_out, - uint32_t max_its, armral_ldpc_decode_options_t options, - Allocator &allocator) { +armral_status +decode_block(uint32_t n, const int8_t *llrs, armral_ldpc_graph_t bg, uint32_t z, + uint32_t len_filler_bits, uint8_t *data_out, uint32_t max_its, + uint32_t options, Allocator &allocator) { bool crc_passed = false; @@ -1677,8 +1732,27 @@ bool decode_block(uint32_t n, const int8_t *llrs, armral_ldpc_graph_t bg, // NOTE: All allocations are now done! if constexpr (Allocator::is_counting) { - return false; + return ARMRAL_SUCCESS; + } + + std::optional maybe_crc_option = parse_crc_option(options); + if (!maybe_crc_option) { + return ARMRAL_ARGUMENT_ERROR; + } + auto crc_option = *maybe_crc_option; + + std::optional maybe_iter_option = parse_iter_option(options); + if (!maybe_iter_option) { + return ARMRAL_ARGUMENT_ERROR; + } + auto iter_option = crc_option == crc_options::NONE ? iter_options::NEVER + : *maybe_iter_option; + + std::optional maybe_implicit_bits = parse_implicit_bits_option(options); + if (!maybe_implicit_bits) { + return ARMRAL_ARGUMENT_ERROR; } + auto implicit_bits = *maybe_implicit_bits; uint32_t r_index = 0; uint32_t no_of_rows = graph->nrows; @@ -1696,9 +1770,7 @@ bool decode_block(uint32_t n, const int8_t *llrs, armral_ldpc_graph_t bg, memset(ptr_l, 0, sizeof(int16_t) * 2 * z); ptr_l = ptr_l + 2 * z; - load_ptr_l(ptr_l, llrs_ptr, n, - ((options & (1 << 4)) == ARMRAL_LDPC_FILLER_BITS_IMPLICIT) * - len_filler_bits, + load_ptr_l(ptr_l, llrs_ptr, n, implicit_bits ? len_filler_bits : 0, (bg == 0) ? 
(z * 20) : (z * 8)); uint32_t full_blk = z_len / num_lanes; @@ -1747,7 +1819,7 @@ bool decode_block(uint32_t n, const int8_t *llrs, armral_ldpc_graph_t bg, } // decision and CRC at the iteration - if ((options & (1 << 2)) == ARMRAL_LDPC_CRC_EVERY_ITER) { + if (iter_option == iter_options::EVERY) { crc_passed = hard_decision(l.get(), crc_buff.get(), &data_out[0], graph->nmessage_bits * z - len_filler_bits, true); @@ -1758,7 +1830,7 @@ bool decode_block(uint32_t n, const int8_t *llrs, armral_ldpc_graph_t bg, } // do decision and CRC - if ((options & (2 << 2)) == ARMRAL_LDPC_CRC_END_ITER) { + if (iter_option == iter_options::END) { // ignore filler bits crc_passed = hard_decision(l.get(), crc_buff.get(), &data_out[0], @@ -1766,48 +1838,46 @@ bool decode_block(uint32_t n, const int8_t *llrs, armral_ldpc_graph_t bg, } // do only decisions - if ((options & 3) == ARMRAL_LDPC_CRC_NO) { + if (iter_option == iter_options::NEVER) { // ignore filler bits crc_passed = hard_decision(l.get(), crc_buff.get(), &data_out[0], graph->nmessage_bits * z - len_filler_bits, false); } - return crc_passed; + return crc_passed ? ARMRAL_SUCCESS : ARMRAL_FAIL; } } // namespace armral::ldpc -template bool armral::ldpc::decode_block( +template armral_status armral::ldpc::decode_block( uint32_t n, const int8_t *llrs, armral_ldpc_graph_t bg, uint32_t z, uint32_t len_filler_bits, uint8_t *data_out, uint32_t max_its, - armral_ldpc_decode_options_t options, heap_allocator &); + uint32_t options, heap_allocator &); -template bool armral::ldpc::decode_block( +template armral_status armral::ldpc::decode_block( uint32_t n, const int8_t *llrs, armral_ldpc_graph_t bg, uint32_t z, uint32_t len_filler_bits, uint8_t *data_out, uint32_t max_its, - armral_ldpc_decode_options_t options, buffer_bump_allocator &); + uint32_t options, buffer_bump_allocator &); armral_status armral_ldpc_decode_block(uint32_t n, const int8_t *llrs, armral_ldpc_graph_t bg, uint32_t z, uint32_t len_filler_bits, uint8_t *data_out, uint32_t max_its, - armral_ldpc_decode_options_t options) { + uint32_t options) { heap_allocator allocator{}; - bool result = armral::ldpc::decode_block( - n, llrs, bg, z, len_filler_bits, data_out, max_its, options, allocator); - return (result) ? ARMRAL_SUCCESS : ARMRAL_RESULT_FAIL; + return armral::ldpc::decode_block(n, llrs, bg, z, len_filler_bits, data_out, + max_its, options, allocator); } armral_status armral_ldpc_decode_block_noalloc( uint32_t n, const int8_t *llrs, armral_ldpc_graph_t bg, uint32_t z, uint32_t len_filler_bits, uint8_t *data_out, uint32_t max_its, - armral_ldpc_decode_options_t options, void *buffer) { + uint32_t options, void *buffer) { buffer_bump_allocator allocator{buffer}; - bool result = armral::ldpc::decode_block( - n, llrs, bg, z, len_filler_bits, data_out, max_its, options, allocator); - return (result) ? 
ARMRAL_SUCCESS : ARMRAL_RESULT_FAIL; + return armral::ldpc::decode_block(n, llrs, bg, z, len_filler_bits, data_out, + max_its, options, allocator); } uint32_t armral_ldpc_decode_block_noalloc_buffer_size(armral_ldpc_graph_t bg, diff --git a/src/UpperPHY/LDPC/arm_ldpc_encoder.cpp b/src/UpperPHY/LDPC/arm_ldpc_encoder.cpp index 749b4e3ffd60ffd454ecf3f7055a882ab6422b98..66d51311aca3f6118eaf7f218e5ec7900609f976 100644 --- a/src/UpperPHY/LDPC/arm_ldpc_encoder.cpp +++ b/src/UpperPHY/LDPC/arm_ldpc_encoder.cpp @@ -2167,64 +2167,7 @@ inline void spmv_hdsm(uint32_t z, uint32_t lsi, inline void copy_input_message(uint32_t z, const armral_ldpc_base_graph_t *graph, const uint8_t *bytes_in, uint8_t *codeword) { - -#if ARMRAL_ARCH_SVE - int32_t num_lanes = svcntb(); - svbool_t pg = svptrue_b8(); - int32_t full_vectors = z / num_lanes; - int32_t tail_size = z - (full_vectors * num_lanes); - - for (uint32_t j = 0; j < graph->nmessage_bits; ++j) { - uint8_t *out_ptr = codeword + j * z; - const uint8_t *in_ptr = bytes_in + j * z; - - for (int32_t vec_idx = 0; vec_idx < full_vectors; ++vec_idx) { - svuint8_t reg = svld1_u8(pg, in_ptr); - svst1_u8(pg, out_ptr, reg); - out_ptr += num_lanes; - in_ptr += num_lanes; - } - // Process tail - if (tail_size != 0) { - svbool_t pg_tail = svwhilelt_b8(0, tail_size); - svuint8_t reg = svld1_u8(pg_tail, in_ptr); - svst1_u8(pg_tail, out_ptr, reg); - } - } -#else - for (uint32_t j = 0; j < graph->nmessage_bits; ++j) { - - uint8_t *out_ptr = codeword + j * z; - const uint8_t *in_ptr = bytes_in + j * z; - - // Process 16 entries at a time - uint32_t blk_cnt = z >> 4U; - while (blk_cnt > 0U) { - // Load inputs - uint8x16_t reg = vld1q_u8(in_ptr); - // store result - vst1q_u8(out_ptr, reg); - blk_cnt--; - out_ptr += 16; - in_ptr += 16; - } - // Process a group of 8 elts - Tail - blk_cnt = z & 0xF; - while (blk_cnt > 7U) { - // Load inputs - uint8x8_t t8_reg = vld1_u8(in_ptr); - // store result - vst1_u8(out_ptr, t8_reg); - blk_cnt -= 8; - out_ptr += 8; - in_ptr += 8; - } - // Process tail - for (uint32_t zb = z - blk_cnt; zb < z; ++zb) { - codeword[j * z + zb] = bytes_in[j * z + zb]; - } - } -#endif + memcpy(codeword, bytes_in, z * graph->nmessage_bits); } inline void calc_hdsm_rhs(uint32_t z, const uint8_t *parity_hdsm, diff --git a/src/UpperPHY/LDPC/ldpc_coding.hpp b/src/UpperPHY/LDPC/ldpc_coding.hpp index aba68bae92251fb62c60541356e874a787364a58..03794659ae2e3d409f33faff18e722030e1260e0 100644 --- a/src/UpperPHY/LDPC/ldpc_coding.hpp +++ b/src/UpperPHY/LDPC/ldpc_coding.hpp @@ -15,9 +15,9 @@ constexpr uint32_t num_lifting_sets = 8; uint32_t get_lifting_index(uint32_t lifting_size); template -bool decode_block(uint32_t n, const int8_t *llrs, armral_ldpc_graph_t bg, - uint32_t z, uint32_t len_filler_bits, uint8_t *data_out, - uint32_t max_its, armral_ldpc_decode_options_t options, - Allocator &allocator); +armral_status +decode_block(uint32_t n, const int8_t *llrs, armral_ldpc_graph_t bg, uint32_t z, + uint32_t len_filler_bits, uint8_t *data_out, uint32_t max_its, + uint32_t options, Allocator &allocator); } // namespace armral::ldpc diff --git a/src/UpperPHY/Polar/arm_polar_frozen_bits.cpp b/src/UpperPHY/Polar/arm_polar_frozen_bits.cpp index b38aa7b419935788eada5695ce227ce09ede8a18..df7e6ea6cdf2e4c5aa869d935604e18a26a639ff 100644 --- a/src/UpperPHY/Polar/arm_polar_frozen_bits.cpp +++ b/src/UpperPHY/Polar/arm_polar_frozen_bits.cpp @@ -872,7 +872,7 @@ polar_frozen_mask_impl_repetition(uint32_t e, uint32_t k, uint8_t *frozen) { int min_weight = 0; uint32_t max_reliability = k + 
n_pc_wm - 1; while (min_weight < arrs->popcount[0] && - arrs->popcount_weight_ind[min_weight] > max_reliability) { + arrs->popcount_weight_ind[min_weight] >= max_reliability) { min_weight++; } uint16_t wm_idx = arrs->popcount_weight_ind[min_weight]; @@ -1030,7 +1030,7 @@ polar_frozen_mask_impl_puncturing(uint32_t e, uint32_t k, uint8_t *frozen) { // with the subtraction in rel_ind rel_ind -= 8; num_info_bits -= info_tmp; - for (; num_info_bits < k + n_pc_wm; ++rel_ind) { + for (; num_info_bits < k; ++rel_ind) { num_info_bits += (arrs->qf[rel_ind] >= e_limit && arrs->q[rel_ind] >= limit) ? 1 : 0; } @@ -1254,7 +1254,7 @@ polar_frozen_mask_impl_shortening(uint32_t e, uint32_t k, uint8_t *frozen) { // separately from the index pointing to all of the k + n_pc information bits rel_ind -= 8; num_info_bits -= info_tmp; - for (; num_info_bits < k + n_pc_wm; ++rel_ind) { + for (; num_info_bits < k; ++rel_ind) { num_info_bits += arrs->qf[rel_ind] < e ? 1 : 0; } uint32_t w_pc_ind = rel_ind; diff --git a/src/utils/allocators.hpp b/src/utils/allocators.hpp index f2868c8569f19337de0e7b39b7f5251c5cecb98b..dbbbbb4268bfc6e4ee06467a445ab16c60753b1a 100644 --- a/src/utils/allocators.hpp +++ b/src/utils/allocators.hpp @@ -20,6 +20,8 @@ class base_allocator { public: static constexpr bool is_counting = allocator_is_counting; + base_allocator() = default; + // Delete implicit copy/move constructors to prevent mistakes. base_allocator(base_allocator const &) = delete; base_allocator(base_allocator &&) = delete; @@ -31,9 +33,6 @@ public: Allocator::deallocate(ptr); } }; - -protected: - base_allocator() = default; }; class heap_allocator : public base_allocator { diff --git a/test/LowerPHY/FFT/FFT16/main.cpp b/test/LowerPHY/FFT/FFT16/main.cpp index fe0710570cdf388e29fb0bb87f3919efd6574ed9..dacc43f1319f29314013715485f2d53c05390498 100644 --- a/test/LowerPHY/FFT/FFT16/main.cpp +++ b/test/LowerPHY/FFT/FFT16/main.cpp @@ -7,7 +7,6 @@ #include "armral.h" #include "cf32_utils.hpp" #include "cs16_utils.hpp" -#include "fft_utils.hpp" #include "qint64.hpp" #include @@ -27,16 +26,22 @@ float clamp_neg1_to_1(float x) { } bool check_fft_results(const char *name, const armral_cmplx_int16_t *result, - const armral_cmplx_f32_t *expected, uint32_t n) { + const armral_cmplx_f32_t *expected, uint32_t n, + float tol = -1.0) { bool passed = true; float max_error = 0; - // check absolute tolerance against eps scaled by the problem size (since - // error will naturally grow as problem size increases). - float tol = FLT_EPSILON * (4 * n - 1); - // since the final result is rounded to Q0.15 format, this is also a - // potential source of large error (especially for smaller problem sizes). - tol = std::max((float32_t)2 / (1 << 15), tol); + // The rounding that happens when converting from fp32 to Q0.15 format + constexpr float32_t rounding = (float32_t)2 / (1 << 15); + + // If not provided, set the tolerance + if (tol < 0.0) { + // fp->int rounding from the first transform propagates into + // the second transform so scale by complexity, n. + // PLUS: 2 transforms in FLT involving 4 flops per complex mul, + // again scaled by complexity, n. 
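+    // Hedged aside (illustration only, not used by this check): the Q0.15
+    // step comes from the 1 << 15 scaling used throughout this test. For
+    // example, quantizing a value x as (int16_t)lrintf(x * (1 << 15)) and
+    // mapping it back as (float)q / (1 << 15) moves x by at most half a
+    // step, and the 'rounding' term above simply allows two full steps of
+    // that size.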
+ tol = (rounding + 2 * FLT_EPSILON * 4) * n; + } for (uint32_t i = 0; i < n; ++i) { auto res = std::complex((float32_t)result[i].re / (1 << 15), @@ -56,24 +61,13 @@ bool check_fft_results(const char *name, const armral_cmplx_int16_t *result, } } - printf("[%s] - check result: %s, max error was %.10f vs tolerance of %.10f\n", + printf("[%s] - check result: %s, max error was %.10f vs tolerance " + "of %.10f\n", name, passed ? "OK" : "ERROR", max_error, tol); return passed; } -std::vector run_fft_ref(int n, armral_fft_direction_t dir, - const armral_cmplx_int16_t *x) { - std::vector> in(n); - std::vector> out(n); - for (int i = 0; i < n; i++) { - in[i].real(x[i].re / (double)(1 << 15)); - in[i].imag(x[i].im / (double)(1 << 15)); - } - armral::utils::fft_ref(n, 1, dir, in.data(), out.data()); - return armral::utils::narrow_to_cf32(out); -} - bool check_status(const armral_status ret_status, const char *message) { if (ret_status == ARMRAL_ARGUMENT_ERROR) { // GCOVR_EXCL_START @@ -84,32 +78,100 @@ bool check_status(const armral_status ret_status, const char *message) { return true; } -bool run_fft_test(int n, armral_fft_direction_t dir) { - printf("Testing FFT n=%d dir=%d\n", n, (int)dir); - constexpr armral_cmplx_int16_t min = {-4096, -4096}; - constexpr armral_cmplx_int16_t max = {4095, 4095}; +bool run_fft_test(int n) { + printf("Testing FFT n=%d\n", n); + + // Figure out how many bits should be used for initial input values + // without running into overflow. + // We have 15 bits: subtract the number of bits needed to represent n. + // The first element of FFT output will be the sum of inputs, so we at least + // need enough bits to represent n; the difference tells us how many bits + // we can use for the input without exceeding 15 bits in the sum of n values. + const int16_t nbits = 15 - (int16_t)ceil(log2(n)); + if (nbits < 1) { + // GCOVR_EXCL_START + printf("Error! 
Input length %d is too big to test without running into " + "overflow issues in int16_t\n", + (int)n); + return false; + // GCOVR_EXCL_STOP + } + // 2^nbits gives us the magnitude of values to use for input + const int16_t n2 = 1 << nbits; + const int16_t nn2 = -n2; + const int16_t n2m1 = n2 - 1; + + const armral_cmplx_int16_t min = {nn2, nn2}; + const armral_cmplx_int16_t max = {n2m1, n2m1}; armral::utils::cs16_random random; const auto x = random.vector(n, min, max); auto y = random.vector(n, min, max); - const auto y_ref = run_fft_ref(n, dir, x.data()); - armral_fft_plan_t *p = nullptr; - auto plan_status = armral_fft_create_plan_cs16(&p, n, dir); - if (!check_status(plan_status, "Failed to create a plan")) { + constexpr int scalef = 1 << 15; + + // For a quick test that the forward transform did + // _something_ we check that the first element + // contains the sum of the input + armral_cmplx_f32_t sum = {0.0, 0.0}; + for (const auto &i : x) { + sum.re += (float32_t)i.re / scalef; + sum.im += (float32_t)i.im / scalef; + } + + std::vector y_ref(n); + for (int i = 0; i < n; ++i) { + y_ref[i].re = ((float32_t)x[i].re / scalef) * n; + y_ref[i].im = ((float32_t)x[i].im / scalef) * n; + } + + armral_fft_plan_t *pf = nullptr; + auto fplan_status = armral_fft_create_plan_cs16(&pf, n, ARMRAL_FFT_FORWARDS); + if (!check_status(fplan_status, "Failed to create a forwards plan")) { // GCOVR_EXCL_START return false; // GCOVR_EXCL_STOP } - auto execute_status = armral_fft_execute_cs16(p, x.data(), y.data()); - if (!check_status(execute_status, "Failed to execute plan")) { + armral_fft_plan_t *pb = nullptr; + auto bplan_status = armral_fft_create_plan_cs16(&pb, n, ARMRAL_FFT_BACKWARDS); + if (!check_status(bplan_status, "Failed to create a backwards plan")) { // GCOVR_EXCL_START return false; // GCOVR_EXCL_STOP } - auto destroy_status = armral_fft_destroy_plan_cs16(&p); - if (!check_status(destroy_status, "Failed to destroy plan")) { + auto fexecute_status = armral_fft_execute_cs16(pf, x.data(), y.data()); + if (!check_status(fexecute_status, "Failed to execute forwards plan")) { + // GCOVR_EXCL_START + return false; + // GCOVR_EXCL_STOP + } + + auto intermediate = + check_fft_results("FFT", y.data(), &sum, 1, (float32_t)2 / (1 << 15)); + if (!intermediate) { + // GCOVR_EXCL_START + printf("Error! Check of forwards transform failed\n"); + return intermediate; + // GCOVR_EXCL_STOP + } + + auto bexecute_status = armral_fft_execute_cs16(pb, y.data(), y.data()); + if (!check_status(bexecute_status, "Failed to execute backwards plan")) { + // GCOVR_EXCL_START + return false; + // GCOVR_EXCL_STOP + } + + auto fdestroy_status = armral_fft_destroy_plan_cs16(&pf); + if (!check_status(fdestroy_status, "Failed to destroy forwards plan")) { + // GCOVR_EXCL_START + return false; + // GCOVR_EXCL_STOP + } + + auto bdestroy_status = armral_fft_destroy_plan_cs16(&pb); + if (!check_status(bdestroy_status, "Failed to destroy backwards plan")) { // GCOVR_EXCL_START return false; // GCOVR_EXCL_STOP @@ -130,9 +192,7 @@ int main(int argc, char **argv) { 272, 289, 342, 361, 440, 441, 484, 529, 552, 768, 800, 1024, 1125, 1140, 1170, 1104, 2048, 2401}; for (int n : ns) { - for (auto dir : {ARMRAL_FFT_FORWARDS, ARMRAL_FFT_BACKWARDS}) { - passed &= run_fft_test(n, dir); - } + passed &= run_fft_test(n); } exit(passed ? 
EXIT_SUCCESS : EXIT_FAILURE); } diff --git a/test/LowerPHY/FFT/FFT16_2d/main.cpp b/test/LowerPHY/FFT/FFT16_2d/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..66ad01b7c5ba72625426afd98b4b5d68d0176591 --- /dev/null +++ b/test/LowerPHY/FFT/FFT16_2d/main.cpp @@ -0,0 +1,181 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause +*/ +#include "armral.h" +#include "cf32_utils.hpp" +#include "cs16_utils.hpp" +#include "qint64.hpp" + +#include +#include +#include +#include + +namespace { + +float clamp_neg1_to_1(float x) { + float low = -1.0; + float high = (float32_t)((1 << 15) - 1) / (1 << 15); + return std::max(low, std::min(high, x)); +} + +bool check_fft_2d_results(const char *name, const armral_cmplx_int16_t *result, + const armral_cmplx_f32_t *expected, uint32_t n0, + uint32_t n1, float tol = -1.0) { + bool passed = true; + float max_error = 0; + + constexpr float32_t rounding = (float32_t)2 / (1 << 15); + + // If not provided, set tolerance based on numerical precision and matrix size + if (tol < 0.0) { + tol = (rounding + 2 * FLT_EPSILON * 4) * n0 * n1; + } + + for (uint32_t i = 0; i < n0 * n1; ++i) { + auto res = std::complex((float32_t)result[i].re / (1 << 15), + (float32_t)result[i].im / (1 << 15)); + auto exp = std::complex(clamp_neg1_to_1(expected[i].re), + clamp_neg1_to_1(expected[i].im)); + auto err = std::abs(res - exp); + max_error = std::max(max_error, err); + if (err > tol) { + // GCOVR_EXCL_START + passed = false; + printf("Error! [%s] result[%u]= %.10f+%.10fi and expected[%u]= " + "%.10f+%.10fi, diff: %.10f is greater than %.10f\n", + name, i, res.real(), res.imag(), i, exp.real(), exp.imag(), err, + tol); + // GCOVR_EXCL_STOP + } + } + + printf("[%s] - check result: %s, max error was %.10f vs tolerance of %.10f\n", + name, passed ? "OK" : "ERROR", max_error, tol); + + return passed; +} + +bool run_fft_2d_test(int n0, int n1) { + printf("Testing 2D FFT n0=%d, n1=%d\n", n0, n1); + + // Determine bit range to avoid overflow + const int16_t nbits = 15 - (int16_t)ceil(log2(n0 * n1)); + if (nbits < 1) { + // GCOVR_EXCL_START + printf("Error! Input size (%dx%d) is too large for int16 FFT without " + "overflow.\n", + n0, n1); + return false; + // GCOVR_EXCL_STOP + } + + // 2^nbits gives us the magnitude of values to use for input + const int16_t n2 = 1 << nbits; + const int16_t nn2 = -n2; + const int16_t n2m1 = n2 - 1; + + const armral_cmplx_int16_t min = {nn2, nn2}; + const armral_cmplx_int16_t max = {n2m1, n2m1}; + + armral::utils::cs16_random random; + std::vector x = random.vector(n0 * n1, min, max); + std::vector y(n0 * n1, min); + + constexpr int scalef = 1 << 15; + + // Compute sum of input for verification + armral_cmplx_f32_t sum = {0.0, 0.0}; + for (const auto &i : x) { + sum.re += (float32_t)i.re / scalef; + sum.im += (float32_t)i.im / scalef; + } + + std::vector y_ref(n0 * n1); + for (int i = 0; i < n0 * n1; ++i) { + y_ref[i].re = ((float32_t)x[i].re / scalef) * (n0 * n1); + y_ref[i].im = ((float32_t)x[i].im / scalef) * (n0 * n1); + } + + armral_fft_plan_t *pf = nullptr; + auto fplan_status = + armral_fft_create_2d_plan_cs16(&pf, n0, n1, ARMRAL_FFT_FORWARDS); + if (fplan_status != ARMRAL_SUCCESS) { + // GCOVR_EXCL_START + printf("Error! 
Failed to create 2D FFT forward plan\n"); + return false; + // GCOVR_EXCL_STOP + } + + armral_fft_plan_t *pb = nullptr; + auto bplan_status = + armral_fft_create_2d_plan_cs16(&pb, n0, n1, ARMRAL_FFT_BACKWARDS); + if (bplan_status != ARMRAL_SUCCESS) { + // GCOVR_EXCL_START + printf("Error! Failed to create 2D FFT backward plan\n"); + return false; + // GCOVR_EXCL_STOP + } + + // Execute Forward FFT + auto fexecute_status = armral_fft_execute_cs16(pf, x.data(), y.data()); + if (fexecute_status != ARMRAL_SUCCESS) { + // GCOVR_EXCL_START + printf("Error! Failed to execute 2D FFT forward plan\n"); + return false; + // GCOVR_EXCL_STOP + } + + // Check forward FFT correctness + auto intermediate = check_fft_2d_results("2D FFT", y.data(), &sum, 1, 1, + (float32_t)2 / (1 << 15) * n0 * n1); + if (!intermediate) { + // GCOVR_EXCL_START + printf("Error! Check of forward 2D transform failed\n"); + return false; + // GCOVR_EXCL_STOP + } + + // Execute Inverse FFT + auto bexecute_status = armral_fft_execute_cs16(pb, y.data(), y.data()); + if (bexecute_status != ARMRAL_SUCCESS) { + // GCOVR_EXCL_START + printf("Error! Failed to execute 2D FFT backward plan\n"); + return false; + // GCOVR_EXCL_STOP + } + + // Destroy FFT plans + armral_fft_destroy_plan_cs16(&pf); + armral_fft_destroy_plan_cs16(&pb); + + return check_fft_2d_results("2D FFT", y.data(), y_ref.data(), n0, n1); +} + +} // Anonymous namespace + +int main(int argc, char **argv) { + bool passed = true; + constexpr std::pair ns_2d[] = { + {2, 2}, {3, 3}, {4, 4}, {5, 5}, {6, 6}, {8, 8}, + {16, 16}, {32, 32}, {64, 64}, {128, 128}, {3, 5}, {7, 11}, + {9, 15}, {13, 27}, {14, 33}, {17, 31}, {19, 43}, {23, 61}, + {29, 67}, {32, 17}, {64, 19}, {128, 23}, {256, 29}, {33, 179}}; + + for (auto [n0, n1] : ns_2d) { + passed &= run_fft_2d_test(n0, n1); + } + + for (int i = 1; i < 1000; i++) { + passed &= run_fft_2d_test(2, i); + } + + for (int i = 1; i < 1000; i++) { + passed &= run_fft_2d_test(i, 2); + } + + exit(passed ? EXIT_SUCCESS : EXIT_FAILURE); +} diff --git a/test/LowerPHY/FFT/FFT32/main.cpp b/test/LowerPHY/FFT/FFT32/main.cpp index 345a4bcbc44b5100f45a39f7f2fa2805920584c6..d00db0507e27e32d60b0e324bbf98113b8a720df 100644 --- a/test/LowerPHY/FFT/FFT32/main.cpp +++ b/test/LowerPHY/FFT/FFT32/main.cpp @@ -6,7 +6,6 @@ */ #include "armral.h" #include "cf32_utils.hpp" -#include "fft_utils.hpp" #include #include @@ -19,20 +18,22 @@ namespace { bool check_fft_results(const char *name, const armral_cmplx_f32_t *result, - const armral_cmplx_f32_t *expected, uint32_t n) { + const armral_cmplx_f32_t *expected, uint32_t n, + float tol = -1.0) { bool passed = true; float max_error = 0; - // check absolute tolerance against eps scaled by the problem size (since - // error will naturally grow as problem size increases). We have a fudge - // factor set to 10, which will be updated with a more accurate count of - // operations in the future - float tol = FLT_EPSILON * (4 * n) * 10; + // If not provided, set a tight tolerance: macheps scaled by FFT complexity + if (tol < 0.0) { + // 2 transforms in fp32 involving 4 flops per complex mul scaled + // by complexity n logn. 
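+    // Hedged worked example (taking FLT_EPSILON to be about 1.19e-7): for
+    // n = 1024 this evaluates to roughly 8 * 1.19e-7 * 1024 * 10, i.e. a
+    // relative tolerance of about 1e-2.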
+ tol = 2 * FLT_EPSILON * 4 * n * std::log2(n); + } for (uint32_t i = 0; i < n; ++i) { std::complex res = {result[i].re, result[i].im}; std::complex exp = {expected[i].re, expected[i].im}; - float err = std::abs(res - exp); + float err = std::abs(res - exp) / std::abs(exp); max_error = std::max(err, max_error); if (err > tol) { // GCOVR_EXCL_START @@ -51,14 +52,6 @@ bool check_fft_results(const char *name, const armral_cmplx_f32_t *result, return passed; } -std::vector run_fft_ref(int n, armral_fft_direction_t dir, - const armral_cmplx_f32_t *x) { - std::vector> in = armral::utils::widen_cf32(x, n); - std::vector> out(n); - armral::utils::fft_ref(n, 1, dir, in.data(), out.data()); - return armral::utils::narrow_to_cf32(out); -} - bool check_status(const armral_status ret_status, const char *message) { if (ret_status == ARMRAL_ARGUMENT_ERROR) { // GCOVR_EXCL_START @@ -69,30 +62,73 @@ bool check_status(const armral_status ret_status, const char *message) { return true; } -bool run_fft_test(int n, armral_fft_direction_t dir) { - printf("Testing FFT n=%d dir=%d\n", n, (int)dir); +bool run_fft_test(int n) { + printf("Testing FFT n=%d\n", n); armral::utils::cf32_random random; const auto x = random.vector(n); auto y = random.vector(n); - const auto y_ref = run_fft_ref(n, dir, x.data()); - armral_fft_plan_t *p = nullptr; - auto plan_status = armral_fft_create_plan_cf32(&p, n, dir); - if (!check_status(plan_status, "Failed to create a plan")) { + // For a quick test that the forward transform did + // _something_ we check that the first element + // contains the sum of the input + armral_cmplx_f32_t sum = {0.0, 0.0}; + for (const auto &i : x) { + sum.re += i.re; + sum.im += i.im; + } + + auto y_ref = x; + for (auto &i : y_ref) { + i.re *= n; + i.im *= n; + } + + armral_fft_plan_t *pf = nullptr; + auto fplan_status = armral_fft_create_plan_cf32(&pf, n, ARMRAL_FFT_FORWARDS); + if (!check_status(fplan_status, "Failed to create a forwards plan")) { // GCOVR_EXCL_START return false; // GCOVR_EXCL_STOP } - auto execute_status = armral_fft_execute_cf32(p, x.data(), y.data()); - if (!check_status(execute_status, "Failed to execute plan")) { + armral_fft_plan_t *pb = nullptr; + auto bplan_status = armral_fft_create_plan_cf32(&pb, n, ARMRAL_FFT_BACKWARDS); + if (!check_status(bplan_status, "Failed to create a backwards plan")) { // GCOVR_EXCL_START return false; // GCOVR_EXCL_STOP } - auto destroy_status = armral_fft_destroy_plan_cf32(&p); - if (!check_status(destroy_status, "Failed to destroy plan")) { + auto fexecute_status = armral_fft_execute_cf32(pf, x.data(), y.data()); + if (!check_status(fexecute_status, "Failed to execute forwards plan")) { + // GCOVR_EXCL_START + return false; + // GCOVR_EXCL_STOP + } + + auto intermediate = + check_fft_results("FFT", y.data(), &sum, 1, 4 * FLT_EPSILON * n); + if (!intermediate) { + printf("Error! 
Check of forwards transform failed\n"); + return intermediate; + } + + auto bexecute_status = armral_fft_execute_cf32(pb, y.data(), y.data()); + if (!check_status(bexecute_status, "Failed to execute backwards plan")) { + // GCOVR_EXCL_START + return false; + // GCOVR_EXCL_STOP + } + + auto fdestroy_status = armral_fft_destroy_plan_cf32(&pf); + if (!check_status(fdestroy_status, "Failed to destroy forwards plan")) { + // GCOVR_EXCL_START + return false; + // GCOVR_EXCL_STOP + } + + auto bdestroy_status = armral_fft_destroy_plan_cf32(&pb); + if (!check_status(bdestroy_status, "Failed to destroy backwards plan")) { // GCOVR_EXCL_START return false; // GCOVR_EXCL_STOP @@ -114,9 +150,7 @@ int main(int argc, char **argv) { 1063, 1198, 1202, 1366, 1728, 2013, 2025, 2030, 2128, 2209, 2401, 2557, 3001, 3226, 3240, 3309, 3482, 3998, 4096, 9413}; for (int n : ns) { - for (auto dir : {ARMRAL_FFT_FORWARDS, ARMRAL_FFT_BACKWARDS}) { - passed &= run_fft_test(n, dir); - } + passed &= run_fft_test(n); } exit(passed ? EXIT_SUCCESS : EXIT_FAILURE); } diff --git a/test/LowerPHY/FFT/FFT32_2d/main.cpp b/test/LowerPHY/FFT/FFT32_2d/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c81c1d30cacc2039ac43e5427ead449bb64aac7a --- /dev/null +++ b/test/LowerPHY/FFT/FFT32_2d/main.cpp @@ -0,0 +1,181 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause +*/ +#include "armral.h" +#include "cf32_utils.hpp" + +#include +#include +#include +#include + +#ifdef ARMRAL_SEMIHOSTING +#define M_PI 3.14159265358979323846 +#endif + +namespace { + +bool check_fft_2d_results(const char *name, const armral_cmplx_f32_t *result, + const armral_cmplx_f32_t *expected, uint32_t n0, + uint32_t n1, float tol = -1.0) { + bool passed = true; + float max_error = 0; + + // Auto-determine tolerance if not provided + if (tol < 0.0) { + tol = 2 * FLT_EPSILON * 4 * n0 * n1 * std::log2(n0 * n1); + } + + for (uint32_t i = 0; i < n0 * n1; ++i) { + std::complex res = {result[i].re, result[i].im}; + std::complex exp = {expected[i].re, expected[i].im}; + float err = std::abs(res - exp) / std::abs(exp); + max_error = std::max(err, max_error); + if (err > tol) { + passed = false; + printf("Error! [%s] result[%u]= %.10f+%.10fi and expected[%u]= " + "%.10f+%.10fi, diff: %.10f is greater than %.10f\n", + name, i, res.real(), res.imag(), i, exp.real(), exp.imag(), err, + tol); + } + } + + printf("[%s] - check result: %s, max error was %.10f vs tolerance of %.10f\n", + name, passed ? "OK" : "ERROR", max_error, tol); + + return passed; +} + +bool run_fft_2d_test(int n0, int n1) { + printf("Testing 2D FFT n0=%d, n1=%d\n", n0, n1); + armral::utils::cf32_random random; + + // Generate random input + std::vector x = + random.vector(n0 * n1, {0.0F, 0.0F}, {1.0F, 1.0F}); + std::vector y(n0 * n1); + + // Compute expected sum of input (should appear at index [0,0]) + armral_cmplx_f32_t sum = {0.0, 0.0}; + for (auto &i : x) { + sum.re += i.re; + sum.im += i.im; + } + + // Expected inverse FFT output (scaled) + std::vector y_ref = x; + for (auto &i : y_ref) { + i.re *= (n0 * n1); + i.im *= (n0 * n1); + } + + // Create 2D FFT plans + armral_fft_plan_t *pf = nullptr; + auto fplan_status = + armral_fft_create_2d_plan_cf32(&pf, n0, n1, ARMRAL_FFT_FORWARDS); + if (fplan_status != ARMRAL_SUCCESS) { + printf("Error! 
Failed to create 2D FFT forward plan\n"); + return false; + } + + armral_fft_plan_t *pb = nullptr; + auto bplan_status = + armral_fft_create_2d_plan_cf32(&pb, n0, n1, ARMRAL_FFT_BACKWARDS); + if (bplan_status != ARMRAL_SUCCESS) { + printf("Error! Failed to create 2D FFT backward plan\n"); + return false; + } + + // Execute Forward FFT + auto fexecute_status = armral_fft_execute_cf32(pf, x.data(), y.data()); + if (fexecute_status != ARMRAL_SUCCESS) { + printf("Error! Failed to execute 2D FFT forward plan\n"); + return false; + } + + // Check forward FFT (first coefficient should match sum) + auto intermediate = check_fft_2d_results("2D FFT", y.data(), &sum, 1, 1, + 4 * FLT_EPSILON * n0 * n1); + if (!intermediate) { + printf("Error! Check of forward 2D transform failed\n"); + return false; + } + + // Execute Inverse FFT + auto bexecute_status = armral_fft_execute_cf32(pb, y.data(), y.data()); + if (bexecute_status != ARMRAL_SUCCESS) { + printf("Error! Failed to execute 2D FFT backward plan\n"); + return false; + } + + // Destroy FFT plans + armral_fft_destroy_plan_cf32(&pf); + armral_fft_destroy_plan_cf32(&pb); + + // Check inverse FFT result + return check_fft_2d_results("2D FFT", y.data(), y_ref.data(), n0, n1); +} + +} // Anonymous namespace + +int main(int argc, char **argv) { + bool passed = true; + + constexpr std::pair ns_2d[] = {// Small square matrices + {2, 2}, + {3, 3}, + {4, 4}, + {5, 5}, + {6, 6}, + // Power-of-two sizes + {8, 8}, + {16, 16}, + {32, 32}, + {64, 64}, + {128, 128}, + {256, 256}, + {512, 512}, + {1024, 1024}, + // Irregular sizes + {3, 5}, + {7, 11}, + {9, 15}, + {13, 27}, + {14, 33}, + // Prime dimensions + {23, 23}, + {17, 31}, + {19, 43}, + {23, 61}, + {29, 67}, + // Mixed prime and power-of-two + {32, 17}, + {64, 19}, + {128, 23}, + {256, 29}, + // Large irregular sizes + {150, 200}, + {300, 500}, + {512, 600}, + {768, 1024}}; + + // Test sample cases listed above + for (auto [n0, n1] : ns_2d) { + passed &= run_fft_2d_test(n0, n1); + } + + // Tests of the first dimension with the second dimension fixed + for (int i = 1; i < 5000; i++) { + passed &= run_fft_2d_test(i, 2); + } + + // Tests of the second dimension with the first dimension fixed + for (int i = 1; i < 5000; i++) { + passed &= run_fft_2d_test(2, i); + } + + exit(passed ? 
EXIT_SUCCESS : EXIT_FAILURE); +} diff --git a/test/UpperPHY/LDPC/Decoding/main.cpp b/test/UpperPHY/LDPC/Decoding/main.cpp index 0c494a09811ec13f3c31b740b718b4018c968f2d..b1164e030fc42a6f52e874bcd2fbdfac11327e51 100644 --- a/test/UpperPHY/LDPC/Decoding/main.cpp +++ b/test/UpperPHY/LDPC/Decoding/main.cpp @@ -100,8 +100,7 @@ bool check_decoded_message(uint32_t len, const uint8_t *orig, template bool run_ldpc_decoding_test(uint32_t its, uint32_t z, armral_ldpc_graph_t bg, - armral_ldpc_decode_options_t options, - uint32_t len_filler_bits, + uint32_t options, uint32_t len_filler_bits, LDPCDecodingFunction ldpc_decoding_under_test) { bool passed = true; @@ -120,7 +119,7 @@ bool run_ldpc_decoding_test(uint32_t its, uint32_t z, armral_ldpc_graph_t bg, auto to_encode = random.vector((buf_len_in + 7) / 8); // If we are doing CRC checking, then we need to attach CRC bits to the input - if ((options & 3) != ARMRAL_LDPC_CRC_NO) { + if (!(options & ARMRAL_LDPC_CRC_NO)) { auto info_to_encode = random.vector((buf_len_in + 7) / 8); ldpc_crc_attachment(info_to_encode.data(), len_in, len_in - 24, to_encode.data()); @@ -139,7 +138,7 @@ bool run_ldpc_decoding_test(uint32_t its, uint32_t z, armral_ldpc_graph_t bg, // Simulate filler bits removal to create test data for // 'ARMRAL_LDPC_FILLER_BITS_IMPLICIT' - if ((options & (1 << 4)) == ARMRAL_LDPC_FILLER_BITS_IMPLICIT) { + if (options & ARMRAL_LDPC_FILLER_BITS_IMPLICIT) { uint32_t len_s_f_bytes = (len_in - 2 * z + len_filler_bits) >> 3; uint32_t len_s_bytes = (len_in - 2 * z) >> 3; uint32_t len_p_bytes = @@ -202,8 +201,7 @@ bool run_all_tests(char const *name, for (auto bg : bgs) { for (uint32_t i = 0; i < zs.size(); i++) { auto z = zs[i]; - armral_ldpc_decode_options_t options = armral_ldpc_decode_options_t( - (z >= 208) ? ARMRAL_LDPC_DEFAULT_OPTIONS : 0); + uint32_t options = z >= 208 ? ARMRAL_LDPC_CRC_24B : ARMRAL_LDPC_CRC_NO; uint32_t len_filler_bits = (z >= 208) ? 32 : 0; for (auto its : num_its) { printf("[%s] z = %d, its = %d len_filler_bits = %d\n", name, z, its, @@ -231,7 +229,7 @@ int main(int argc, char **argv) { "LDPCDecodingNoAlloc", [](uint32_t n, const int8_t *llrs, armral_ldpc_graph_t bg, uint32_t z, uint32_t len_filler_bits, uint8_t *data_out, uint32_t max_its, - armral_ldpc_decode_options_t options) { + uint32_t options) { auto buffer_size = armral_ldpc_decode_block_noalloc_buffer_size(bg, z, max_its); std::vector buffer(buffer_size); diff --git a/test/UpperPHY/Polar/Frozen/main.cpp b/test/UpperPHY/Polar/Frozen/main.cpp index e42651f2b1bf8e20a13857196d5c464b89443a17..6c27fe0dcba4ddc88c904bec00c5cf3a3f214908 100644 --- a/test/UpperPHY/Polar/Frozen/main.cpp +++ b/test/UpperPHY/Polar/Frozen/main.cpp @@ -151,7 +151,7 @@ static void polar_frozen_mask_ref(uint32_t n, uint32_t e, uint32_t k, int qi_tmp_idx = n - 1; uint32_t wm_idx = ~0U; uint32_t wm_weight = ~0U; - for (uint32_t i = 0; i < k + n_pc_wm; ++i) { + for (uint32_t i = 0; i < k; ++i) { // work backwards through qi_tmp, take elements not in qf_tmp while (qf_tmp[qi_tmp[qi_tmp_idx]]) { --qi_tmp_idx; @@ -165,8 +165,17 @@ static void polar_frozen_mask_ref(uint32_t n, uint32_t e, uint32_t k, wm_weight = new_wm_weight; } } - if (n_pc_wm != 0) { + if (n_pc_wm == 1) { + // When n_pc_wm == 1, then we add one more INFO_BIT and replace an INFO_BIT + // with a PARITY_BIT at the previously most reliable index with the minimum + // weight. 
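+    // Hedged note: in the domain exercised by main() below (18 <= k <= 25,
+    // n_pc == 3 and e - k + 3 > 192, per TS 38.212 section 6.3.1.3.1),
+    // n_pc_wm is only ever 0 or 1, so this is the sole extra case the
+    // reference needs to handle.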
+ while (qf_tmp[qi_tmp[qi_tmp_idx]]) { + --qi_tmp_idx; + } + uint32_t bit_idx = qi_tmp[qi_tmp_idx]; + frozen[bit_idx] = ARMRAL_POLAR_INFO_BIT; frozen[wm_idx] = ARMRAL_POLAR_PARITY_BIT; + --qi_tmp_idx; } for (uint32_t i = 0; i < n_pc - n_pc_wm; ++i) { // work backwards through qi_tmp, take elements not in qf_tmp @@ -206,7 +215,10 @@ int main(int argc, char **argv) { // test e >= n to check repetition doesn't affect the frozen mask. for (int e = k + n_pc; e <= n + 1; ++e) { passed &= run_polar_frozen_mask_test(n, e, k, n_pc, 0); - if (n_pc > 0) { + + // There is a limited domain where n_pc_wm = 1. See 6.3.1.3.1 of TS + // 38.212. + if (k >= 18 && k <= 25 && n_pc == 3 && e - k + 3 > 192) { passed &= run_polar_frozen_mask_test(n, e, k, n_pc, 1); } } diff --git a/utils/fft_utils.hpp b/utils/fft_utils.hpp deleted file mode 100644 index ce1a0538538ec3b0ba4d7417268af3296d2a8a59..0000000000000000000000000000000000000000 --- a/utils/fft_utils.hpp +++ /dev/null @@ -1,45 +0,0 @@ -/* - Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its - affiliates - SPDX-License-Identifier: BSD-3-Clause -*/ -#pragma once - -#include "armral.h" - -#include - -#if defined(ARMRAL_SEMIHOSTING) || !defined(_GNU_SOURCE) -#define M_PI 3.14159265358979323846 -#endif - -namespace armral::utils { - -inline void fft_ref(int n, int s, armral_fft_direction_t dir, - const std::complex *x, std::complex *y) { - using namespace std::complex_literals; - if (n % 2 == 0) { - fft_ref(n / 2, 2 * s, dir, x, y); - fft_ref(n / 2, 2 * s, dir, x + s, y + n / 2); - - for (int k = 0; k < n / 2; k++) { - auto m = 2i * (dir * M_PI * k / n); - auto p = y[k]; - auto q = y[k + n / 2] * std::exp(m); - y[k] = p + q; - y[k + n / 2] = p - q; - } - } else { - for (int i = 0; i < n; ++i) { - auto acc = 0i; - for (int j = 0; j < n; ++j) { - auto m = 2i * (dir * M_PI * i * j / n); - acc += x[j * s] * std::exp(m); - } - y[i] = acc; - } - } -} - -} // namespace armral::utils
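
With the shared fft_utils.hpp reference DFT removed, the FFT tests in this patch check two identities instead: the first output bin of the forward transform equals the sum of the inputs, and running the unnormalized backward transform over the forward output reproduces the input scaled by n. The following is a minimal, hedged sketch of that check against the public cs16 entry points used by the tests above; the problem size, input values and tolerances are illustrative choices, not library requirements.

    #include "armral.h"

    #include <cmath>
    #include <cstdint>
    #include <cstdio>
    #include <cstdlib>
    #include <vector>

    int main() {
      const int n = 16;

      // Keep the inputs small enough that both the forward sum and the
      // backward output (n times the input) stay inside int16_t range.
      std::vector<armral_cmplx_int16_t> x(n);
      for (int i = 0; i < n; ++i) {
        x[i].re = static_cast<int16_t>(100 * (i + 1));
        x[i].im = static_cast<int16_t>(-50 * i);
      }
      std::vector<armral_cmplx_int16_t> y(n);

      armral_fft_plan_t *pf = nullptr;
      armral_fft_plan_t *pb = nullptr;
      if (armral_fft_create_plan_cs16(&pf, n, ARMRAL_FFT_FORWARDS) !=
              ARMRAL_SUCCESS ||
          armral_fft_create_plan_cs16(&pb, n, ARMRAL_FFT_BACKWARDS) !=
              ARMRAL_SUCCESS) {
        return EXIT_FAILURE;
      }

      // Forward transform: bin 0 should hold the sum of the inputs.
      armral_fft_execute_cs16(pf, x.data(), y.data());
      int32_t sum_re = 0;
      int32_t sum_im = 0;
      for (int i = 0; i < n; ++i) {
        sum_re += x[i].re;
        sum_im += x[i].im;
      }
      // A few quantization steps of slack on the DC bin.
      bool ok = std::abs(y[0].re - sum_re) <= 4 && std::abs(y[0].im - sum_im) <= 4;

      // Backward transform in place: the result should be n times the input.
      armral_fft_execute_cs16(pb, y.data(), y.data());
      for (int i = 0; i < n; ++i) {
        // Loose allowance of a few Q0.15 steps per element, scaled by n,
        // mirroring the shape of the tolerance used in the tests above.
        ok = ok && std::abs(y[i].re - n * x[i].re) <= 3 * n;
        ok = ok && std::abs(y[i].im - n * x[i].im) <= 3 * n;
      }

      armral_fft_destroy_plan_cs16(&pf);
      armral_fft_destroy_plan_cs16(&pb);

      std::printf("cs16 FFT round trip: %s\n", ok ? "OK" : "FAILED");
      return ok ? EXIT_SUCCESS : EXIT_FAILURE;
    }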