diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 7a58224db35d9760fa64ddc9f1e35fad726c4af4..a2d482b068478b3ff15287a4f1ed2cd56441ef93 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,3 +1,4 @@ +--- default: image: ubuntu:22.04 tags: diff --git a/CHANGELOG.md b/CHANGELOG.md index 500fdf51998205dec0773de49c209a07c4c05da5..915c20fa31fc22722a6752845813dd80971c2002 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,515 +17,576 @@ documented in this file. ### Security +## [24.10] - 2024-10-17 + +### Added + +- Added the function `armral_turbo_perm_idx_init` which generates all + permutation indices used in the permutation step of LTE Turbo decoding. + +- Added the function `armral_cmplx_matmul_i16_noalloc` which multiplies two + matrices of complex Q15 values using a 64-bit Q32.31 accumulator. This + function does not call any system memory allocators, unlike the existing + `armral_cmplx_matmul_i16` function. + +### Changed + +- The interfaces for `armral_turbo_decode_block` and + `armral_turbo_decode_block_noalloc` now have an additional argument. They now + include the option to supply a user-allocated buffer which, if used, must be + initialized with permutation indices by calling + `armral_turbo_perm_idx_init`. This buffer can then be reused in subsequent + calls to the Turbo decoding functions and will improve their performance by + removing the need to compute the indices on each call. If the buffer is not + initialized and a null pointer is passed instead, the functions will recompute + the permutation indices on every call. + +- Improved performance of `armral_fft_execute_cf32` and + `armral_fft_execute_cs16`. Cases which were calculated using recursive calls + to Rader's algorithm are now calculated using Bluestein's algorithm. 
+ +### Fixed + +- Fixed performance regressions in the SVE versions of the following routines: + + - `armral_cmplx_vecdot_f32` + - `armral_cmplx_vecmul_f32_2` + ## [24.07] - 2024-07-18 ### Added + - CMake option `ARMRAL_ENABLE_WEXTRA` to add the compiler flag `-Wextra` when -building the library and tests. + building the library and tests. ### Changed + - Documentation is now installed by the `make install` target, if it has been -built. + built. -- Improved performance of `armral_cmplx_matmul_f32`. For complex 32-bit -floating point matrix multiplication, we recommend you use this function for -all cases. This function calls existing optimized special cases with minimal -overhead and has new optimizations for larger cases. +- Improved performance of `armral_cmplx_matmul_f32`. For complex 32-bit floating + point matrix multiplication, we recommend you use this function for all + cases. This function calls existing optimized special cases with minimal + overhead and has new optimizations for larger cases. - Improved performance of `armral_turbo_decode_block` and -`armral_turbo_decode_block_noalloc`. These functions now operate internally on -16-bit integer values rather than 16-bit or 32-bit floating point values. + `armral_turbo_decode_block_noalloc`. These functions now operate internally on + 16-bit integer values rather than 16-bit or 32-bit floating point values. 
- The following functions now use unsigned integers in their interfaces to -represent the lengths of vectors and the dimensions of matrices: - - `armral_cmplx_vecdot_f32` - - `armral_cmplx_vecdot_f32_2` - - `armral_cmplx_vecdot_i16` - - `armral_cmplx_vecdot_i16_2` - - `armral_cmplx_vecdot_i16_32bit` - - `armral_cmplx_vecdot_i16_2_32bit` - - `armral_cmplx_vecmul_f32` - - `armral_cmplx_vecmul_f32_2` - - `armral_cmplx_vecmul_i16` - - `armral_cmplx_vecmul_i16_2` - - `armral_corr_coeff_i16` - - `armral_svd_cf32` - - `armral_svd_cf32_noalloc` - - `armral_svd_cf32_noalloc_buffer_size` + represent the lengths of vectors and the dimensions of matrices: + + - `armral_cmplx_vecdot_f32` + - `armral_cmplx_vecdot_f32_2` + - `armral_cmplx_vecdot_i16` + - `armral_cmplx_vecdot_i16_2` + - `armral_cmplx_vecdot_i16_32bit` + - `armral_cmplx_vecdot_i16_2_32bit` + - `armral_cmplx_vecmul_f32` + - `armral_cmplx_vecmul_f32_2` + - `armral_cmplx_vecmul_i16` + - `armral_cmplx_vecmul_i16_2` + - `armral_corr_coeff_i16` + - `armral_svd_cf32` + - `armral_svd_cf32_noalloc` + - `armral_svd_cf32_noalloc_buffer_size` - Renamed `armral_cmplx_mat_mult_aah_f32` to be `armral_cmplx_matmul_aah_f32`. -All arguments are in the same order and have the same meaning. + All arguments are in the same order and have the same meaning. - Replaced `armral_cmplx_mat_mult_ahb_f32` with `armral_cmplx_matmul_ahb_f32`. -Note that the meanings of the parameters `m`, `n`, and `k` differ between the -old function and the new; a call to the old function of the form + Note that the meanings of the parameters `m`, `n`, and `k` differ between the + old function and the new; a call to the old function of the form `armral_cmplx_mat_mult_ahb_f32(dim1, dim2, dim3, a, b, c);` -becomes + becomes `armral_cmplx_matmul_ahb_f32(dim2, dim3, dim1, a, b, c);` -- Replaced `armral_cmplx_mat_mult_i16` with `armral_cmplx_matmul_i16`. 
-Note that the meanings of the parameters `m`, `n`, and `k` differ between the -old function and the new; a call to the old function of the form +- Replaced `armral_cmplx_mat_mult_i16` with `armral_cmplx_matmul_i16`. Note + that the meanings of the parameters `m`, `n`, and `k` differ between the old + function and the new; a call to the old function of the form `armral_cmplx_mat_mult_i16(dim1, dim2, dim3, a, b, c);` -becomes + becomes `armral_cmplx_matmul_i16(dim1, dim3, dim2, a, b, c);` -- Replaced `armral_cmplx_mat_mult_i16_32bit` with `armral_cmplx_matmul_i16_32bit`. -Note that the meanings of the parameters `m`, `n`, and `k` differ between the -old function and the new; a call to the old function of the form +- Replaced `armral_cmplx_mat_mult_i16_32bit` with + `armral_cmplx_matmul_i16_32bit`. Note that the meanings of the parameters + `m`, `n`, and `k` differ between the old function and the new; a call to the + old function of the form `armral_cmplx_mat_mult_i16_32bit(dim1, dim2, dim3, a, b, c);` -becomes + becomes `armral_cmplx_matmul_i16_32bit(dim1, dim3, dim2, a, b, c);` -- Replaced `armral_cmplx_matmul_f32` with `armral_cmplx_matmul_f32`. -Note that the meanings of the parameters `m`, `n`, and `k` differ between the -old function and the new; a call to the old function of the form +- Replaced `armral_cmplx_matmul_f32` with `armral_cmplx_matmul_f32`. Note that + the meanings of the parameters `m`, `n`, and `k` differ between the old + function and the new; a call to the old function of the form `armral_cmplx_mat_mult_f32(dim1, dim2, dim3, a, b, c);` -becomes + becomes `armral_cmplx_matmul_f32(dim1, dim3, dim2, a, b, c);` ### Fixed + - Corrected documentation for `armral_cmplx_mat_inverse_batch_f32` and -`armral_cmplx_mat_inverse_batch_f32_pa` to clarify that these functions have no -restriction on batch sizes. + `armral_cmplx_mat_inverse_batch_f32_pa` to clarify that these functions have + no restriction on batch sizes. 
## [24.04] - 2024-04-19 ### Added + - Makefile target `bench_excel_summary` to run the benchmarks and create an -Excel spreadsheet containing the results. + Excel spreadsheet containing the results. ### Changed + - Moved `license_terms/BSD-3-Clause.txt` and -`license_terms/third_party_licenses.txt` to -[LICENSE.md](https://gitlab.arm.com/networking/ral/-/blob/main/LICENSE.md) and -[THIRD_PARTY_LICENSES.md](https://gitlab.arm.com/networking/ral/-/blob/main/THIRD_PARTY_LICENSES.md) -respectively. + `license_terms/third_party_licenses.txt` to + [LICENSE.md](https://gitlab.arm.com/networking/ral/-/blob/main/LICENSE.md) and + [THIRD_PARTY_LICENSES.md](https://gitlab.arm.com/networking/ral/-/blob/main/THIRD_PARTY_LICENSES.md) + respectively. - Extended `armral_cmplx_pseudo_inverse_direct_f32` and -`armral_cmplx_pseudo_inverse_direct_f32_noalloc` to compute the regularized -pseudo-inverse of a single complex 32-bit matrix of size `M-by-N` for the case -where `M` and/or `N` == 1. + `armral_cmplx_pseudo_inverse_direct_f32_noalloc` to compute the regularized + pseudo-inverse of a single complex 32-bit matrix of size `M-by-N` for the case + where `M` and/or `N` == 1. - Improved SVE2 performance of `armral_turbo_decode_block` and -`armral_turbo_decode_block_noalloc`. + `armral_turbo_decode_block_noalloc`. - Improved SVE2 performance of `armral_ldpc_encode_block` and -`armral_ldpc_encode_block_noalloc`. + `armral_ldpc_encode_block_noalloc`. ## [24.01] - 2024-01-19 ### Changed + - Extended `armral_cmplx_pseudo_inverse_direct_f32` and -`armral_cmplx_pseudo_inverse_direct_f32_noalloc` to compute the regularized -pseudo-inverse of a single complex 32-bit matrix of size `M-by-N` for cases -where `M > N` in addition to the cases where `M <= N`. + `armral_cmplx_pseudo_inverse_direct_f32_noalloc` to compute the regularized + pseudo-inverse of a single complex 32-bit matrix of size `M-by-N` for cases + where `M > N` in addition to the cases where `M <= N`. 
- Improved performance of `armral_turbo_decode_block` and -`armral_turbo_decode_block_noalloc`. + `armral_turbo_decode_block_noalloc`. - Improved SVE2 performance of `armral_seq_generator`, for the cases when -`sequence_len` is not a multiple of 64. + `sequence_len` is not a multiple of 64. ### Fixed + - LDPC block encoding (`armral_ldpc_encode_block`), rate matching -(`armral_ldpc_rate_matching`) and rate recovery (`armral_ldpc_rate_recovery`), -and the corresponding channel simulator, now support the insertion and removal -of filler bits as described in the 3GPP Technical Specification (TS) 38.212. -From [@Suraj4g5g](https://gitlab.arm.com/Suraj4g5g). + (`armral_ldpc_rate_matching`) and rate recovery (`armral_ldpc_rate_recovery`), + and the corresponding channel simulator, now support the insertion and removal + of filler bits as described in the 3GPP Technical Specification (TS) 38.212. + From [@Suraj4g5g](https://gitlab.arm.com/Suraj4g5g). ## [23.10] - 2023-10-06 ### Changed + - Extended the `sequence_len` parameter of `armral_seq_generator` to `uint32_t`. -From [@Suraj4g5g](https://gitlab.arm.com/Suraj4g5g). + From [@Suraj4g5g](https://gitlab.arm.com/Suraj4g5g). - Added parameter `i_bil` to `armral_polar_rate_matching` and -`armral_polar_rate_recovery` to enable or disable bit interleaving. From -[@Suraj4g5g](https://gitlab.arm.com/Suraj4g5g). + `armral_polar_rate_recovery` to enable or disable bit interleaving. From + [@Suraj4g5g](https://gitlab.arm.com/Suraj4g5g). - Added parameter `nref` to `armral_ldpc_rate_matching` and -`armral_ldpc_rate_recovery` to enable the functions to be used with a soft -buffer size. From [@Suraj4g5g](https://gitlab.arm.com/Suraj4g5g). + `armral_ldpc_rate_recovery` to enable the functions to be used with a soft + buffer size. From [@Suraj4g5g](https://gitlab.arm.com/Suraj4g5g). - Added parameter nref to `armral_ldpc_rate_matching` and -`armral_ldpc_rate_recovery` to enable the functions to be used with a soft -buffer size. 
From [@Suraj4g5g](https://gitlab.arm.com/Suraj4g5g). + `armral_ldpc_rate_recovery` to enable the functions to be used with a soft + buffer size. From [@Suraj4g5g](https://gitlab.arm.com/Suraj4g5g). - Improved Neon performance of Polar block decoding -(`armral_polar_decode_block`) for list sizes 1, 2, 4 and 8. + (`armral_polar_decode_block`) for list sizes 1, 2, 4 and 8. - Improved Neon performance of LDPC block decoding (`armral_ldpc_decode_block` -and `armral_ldpc_decode_block_noalloc`). + and `armral_ldpc_decode_block_noalloc`). - Simulation programs are now built by default and are tested by the make check -target. + target. ## [23.07] - 2023-07-07 ### Added + - New function to compute the regularized pseudo-inverse of a single complex -32-bit floating-point matrix (`armral_cmplx_pseudo_inverse_direct_f32`). + 32-bit floating-point matrix (`armral_cmplx_pseudo_inverse_direct_f32`). - New function to compute the multiplication of a complex 32-bit floating-point -matrix with its conjugate transpose (`armral_cmplx_mat_mult_aah_f32`). + matrix with its conjugate transpose (`armral_cmplx_mat_mult_aah_f32`). - New function to compute the complex 32-bit floating-point multiplication of -the conjugate transpose of a matrix with a matrix -(`armral_cmplx_mat_mult_ahb_f32`). + the conjugate transpose of a matrix with a matrix + (`armral_cmplx_mat_mult_ahb_f32`). - Variants of existing functions which take a pre-allocated buffer rather than -performing memory allocations internally. For functions where the buffer size is -not easily calculated from the input parameters, helper functions to calculate -the required size have been provided. + performing memory allocations internally. For functions where the buffer size + is not easily calculated from the input parameters, helper functions to + calculate the required size have been provided. 
- Neon-optimized implementation of batched complex 32-bit floating-point -matrix-vector multiplication (`armral_cmplx_mat_vec_mult_batch_f32`). + matrix-vector multiplication (`armral_cmplx_mat_vec_mult_batch_f32`). - SVE2-optimized implementation of complex 32-bit floating-point general matrix -inverse for matrices of size `2x2`, `3x3` and `4x4` -(`armral_cmplx_mat_inverse_f32`). + inverse for matrices of size `2x2`, `3x3` and `4x4` + (`armral_cmplx_mat_inverse_f32`). ### Changed + - Improved Neon and SVE2 performance of Mu Law compression -(`armral_mu_law_compr_8bit`, `armral_mu_law_compr_9bit`, and -`armral_mu_law_compr_14bit`). + (`armral_mu_law_compr_8bit`, `armral_mu_law_compr_9bit`, and + `armral_mu_law_compr_14bit`). - Improved Neon performance of 8-bit block float compression -(`armral_block_float_compr_8bit`). + (`armral_block_float_compr_8bit`). - Improved SVE2 performance of 9-bit block scaling decompression -(`armral_block_scaling_decompr_9bit`). + (`armral_block_scaling_decompr_9bit`). - Improved SVE2 performance of 14-bit block scaling decompression -(`armral_block_scaling_decompr_14bit`). + (`armral_block_scaling_decompr_14bit`). - Improved SVE2 performance of 8-bit and 12-bit block float compression -(`armral_block_float_compr_8bit` and `armral_block_float_compr_12bit`). + (`armral_block_float_compr_8bit` and `armral_block_float_compr_12bit`). - Moved the definition of the symbol rate out of the `ebn0_to_snr` function -(`simulation/awgn/awgn.cpp`) so that it is now a parameter that gets passed in -by each of the simulation programs. + (`simulation/awgn/awgn.cpp`) so that it is now a parameter that gets passed in + by each of the simulation programs. - Updated the `convolutional_awgn` simulation program to use OpenMP -(`simulation/convolutional_awgn/convolutional_awgn.cpp`). + (`simulation/convolutional_awgn/convolutional_awgn.cpp`). - Updated simulation programs to accept a path to write graphs to, instead of -auto-generating filenames. 
+ auto-generating filenames. - Added the maximum number of iterations to the output of the Turbo simulation -program (`simulation/turbo_awgn/turbo_error_rate.py`). + program (`simulation/turbo_awgn/turbo_error_rate.py`). - Updated formatting of labels in simulation graph legends. ### Fixed + - Removed bandwidth scaling in all simulation programs so that the maximum -spectral efficiency does not exceed the number of bits per symbol. + spectral efficiency does not exceed the number of bits per symbol. - Convolutional decoding algorithm -(`armral_tail_biting_convolutional_decode_block`) now returns correct results -for input lengths greater than 255. + (`armral_tail_biting_convolutional_decode_block`) now returns correct results + for input lengths greater than 255. - Test file for convolutional decoding (`test/ConvCoding/decoding/main.cpp`) is -updated so that the tests pass as expected for input lengths which are not a -multiple of 4. + updated so that the tests pass as expected for input lengths which are not a + multiple of 4. - Neon block float decompression functions (`armral_block_float_decompr_8bit`, -`armral_block_float_decompr_9bit`, `armral_block_float_decompr_12bit`, and -`armral_block_float_decompr_14bit`) now truncate values before storing rather -than rounding them. This means the Neon implementations of these functions now -have the same behavior as the SVE implementations. + `armral_block_float_decompr_9bit`, `armral_block_float_decompr_12bit`, and + `armral_block_float_decompr_14bit`) now truncate values before storing rather + than rounding them. This means the Neon implementations of these functions now + have the same behavior as the SVE implementations. - Neon block scaling decompression functions. -(`armral_block_scaling_decompr_8bit`, `armral_block_scaling_decompr_9bit`, and -`armral_block_scaling_decompr_14bit`) now truncate values before storing rather -than rounding them. 
This means the Neon implementations of these functions now -have the same behavior as the SVE implementations. + (`armral_block_scaling_decompr_8bit`, `armral_block_scaling_decompr_9bit`, and + `armral_block_scaling_decompr_14bit`) now truncate values before storing + rather than rounding them. This means the Neon implementations of these + functions now have the same behavior as the SVE implementations. ## [23.04] - 2023-04-21 ### Added + - Cyclic Redundancy Check (CRC) attachment function -(`armral_polar_crc_attachment`) for Polar codes, described in section 5.2.1 of -the 3GPP Technical Specification (TS) 38.212. + (`armral_polar_crc_attachment`) for Polar codes, described in section 5.2.1 of + the 3GPP Technical Specification (TS) 38.212. - CRC function to check the validity of the output(s) of Polar decoding -(`armral_check_crc_polar`). + (`armral_check_crc_polar`). - New simulation program `modulation_awgn` which plots the error rate versus -Eb/N0 (or signal-to-noise ratio (SNR)) of taking a hard demodulation decision -for data sent over a noisy channel with no forward error correction. + Eb/N0 (or signal-to-noise ratio (SNR)) of taking a hard demodulation decision + for data sent over a noisy channel with no forward error correction. - Added a field called `snr` to the JSON output of all simulation programs, -which stores the signal-to-noise ratio. + which stores the signal-to-noise ratio. - Added a flag called `x-unit` to all plotting scripts which allows the user to -choose whether Eb/N0 or SNR is plotted on the x-axis. + choose whether Eb/N0 or SNR is plotted on the x-axis. - Added CRC attachment and check in Polar codes simulation. ### Changed - Updated [license terms] -(https://gitlab.arm.com/networking/ral/-/blob/main/license_terms/BSD-3-Clause.txt) -to BSD-3-Clause. + (https://gitlab.arm.com/networking/ral/-/blob/main/license_terms/BSD-3-Clause.txt) + to BSD-3-Clause. - Updated Polar decoding (`armral_polar_decode_block`) to accept a list size of -8. 
+ 8. - LDPC decoding (`armral_ldpc_decode_block`) can optionally make use of attached -CRC information to terminate iteration early in the case that a match is found. + CRC information to terminate iteration early in the case that a match is + found. - Improved Neon performance of tail biting convolutional encoder for LTE -(`armral_tail_biting_convolutional_encode_block`). + (`armral_tail_biting_convolutional_encode_block`). - Improved Neon performance of tail biting convolutional decoder for LTE -(`armral_tail_biting_convolutional_decode_block`). + (`armral_tail_biting_convolutional_decode_block`). ### Fixed + - Calculation of the encoded data length in the LDPC simulation program -(`armral/simulation/ldpc_awgn/ldpc_error_rate.py`) is updated to match that used -in Arm RAN Acceleration Library. + (`armral/simulation/ldpc_awgn/ldpc_error_rate.py`) is updated to match that + used in ArmRAL. - Graphs generated from results of simulation programs in the simulation -directory no longer plot Shannon limits and theoretical maxima versus block -error rates. Shannon limits and theoretical maxima continue to be plotted for -bit error rates. + directory no longer plot Shannon limits and theoretical maxima versus block + error rates. Shannon limits and theoretical maxima continue to be plotted for + bit error rates. ## [23.01] - 2023-01-27 ### Added + - Rate matching for Turbo coding (`armral_turbo_rate_matching`). This implements -the operations in section 5.1.4.1 of the 3GPP Technical Specification (TS) -36.212. + the operations in section 5.1.4.1 of the 3GPP Technical Specification (TS) + 36.212. - Rate recovery for Turbo coding (`armral_turbo_rate_recovery`). This implements -the inverse operations of rate matching. Rate matching is described in section -5.1.4.1 of the 3GPP Technical Specification (TS) 36.212. + the inverse operations of rate matching. Rate matching is described in section + 5.1.4.1 of the 3GPP Technical Specification (TS) 36.212. 
- Tail-biting convolutional encoder for LTE -(`armral_tail_biting_convolutional_encode_block`). + (`armral_tail_biting_convolutional_encode_block`). - Tail-biting convolutional decoder for LTE -(`armral_tail_biting_convolutional_decode_block`). + (`armral_tail_biting_convolutional_decode_block`). - Scrambling for Physical Uplink Control Channels (PUCCH) formats 2, 3 and 4, -Physical Downlink Shared Channel (PDSCH), Physical Downlink Control Channel -(PDCCH), and Physical Broadcast Channel (PBCH) (`armral_scramble_code_block`). -This covers scrambling as described in 3GPP Technical Specification (TS) 38.211, -sections 6.3.2.5.1, 6.3.2.6.1, 7.3.1.1, 7.3.2.3, and 7.3.3.1. + Physical Downlink Shared Channel (PDSCH), Physical Downlink Control Channel + (PDCCH), and Physical Broadcast Channel (PBCH) (`armral_scramble_code_block`). + This covers scrambling as described in 3GPP Technical Specification (TS) + 38.211, sections 6.3.2.5.1, 6.3.2.6.1, 7.3.1.1, 7.3.2.3, and 7.3.3.1. - Simulation program for LTE tail-biting convolutional coding -(`armral/simulation/convolutional_awgn`). + (`armral/simulation/convolutional_awgn`). - Python script that allows users to draw the data rates of each modulation and -compare them to the capacity of the AWGN channel -(`armral/simulation/capacity/capacity.py`). + compare them to the capacity of the AWGN channel + (`armral/simulation/capacity/capacity.py`). - SVE2-optimized implementation of complex 32-bit floating point matrix-vector -multiplication (`armral_cmplx_mat_vec_mult_f32`). + multiplication (`armral_cmplx_mat_vec_mult_f32`). - SVE2-optimized implementation of 14-bit block scaling decompression -(`armral_block_scaling_decompr_14bit`). + (`armral_block_scaling_decompr_14bit`). ### Changed + - Modified error rate Python scripts (under `armral/simulation`) to use Eb/N0 as -x-axis (instead of the SNR) and to show the Shannon limits. + x-axis (instead of the SNR) and to show the Shannon limits. 
- Added Turbo rate matching and recovery to the Turbo simulation program -(`armral/simulation/turbo_awgn/turbo_awgn.cpp`). + (`armral/simulation/turbo_awgn/turbo_awgn.cpp`). - Improved Neon performance of block-float decompression for 9-bit and 14-bit -block-float representations. (`armral_block_float_decompr_9bit` and -`armral_block_float_decompr_14bit`). + block-float representations. (`armral_block_float_decompr_9bit` and + `armral_block_float_decompr_14bit`). - Improved Neon performance of complex 32-bit floating point matrix-vector -multiplication (`armral_cmplx_mat_vec_mult_f32`). + multiplication (`armral_cmplx_mat_vec_mult_f32`). - Improved Neon performance of Gold sequence generator (`armral_seq_generator`). - Improved Neon performance of general matrix inversion -(`armral_cmplx_mat_inverse_f32`). + (`armral_cmplx_mat_inverse_f32`). - Improved Neon performance of batched general matrix inversion -(`armral_cmplx_mat_inverse_batch_f32`). + (`armral_cmplx_mat_inverse_batch_f32`). ### Fixed + - Documentation of the interface for Polar rate recovery -(armral_polar_rate_recovery) updated to reflect how the parameters are used in -the implementation. + (`armral_polar_rate_recovery`) updated to reflect how the parameters are used + in the implementation. ## [22.10] - 2022-10-07 ### Added + - SVE2-optimized implementations of `2x2` and `4x4` matrix multiplication -functions where in-phase and quadrature components are separated -(`armral_cmplx_mat_mult_2x2_f32_iq` and `armral_cmplx_mat_mult_4x4_f32_iq`). + functions where in-phase and quadrature components are separated + (`armral_cmplx_mat_mult_2x2_f32_iq` and `armral_cmplx_mat_mult_4x4_f32_iq`). ### Changed + - The program to evaluate the error-correction performance of Polar coding in -the presence of additive white Gaussian noise (AWGN) located in -`simulation/polar_awgn` is updated to no longer take the length of a code block -as a parameter. 
+ the presence of additive white Gaussian noise (AWGN) located in + `simulation/polar_awgn` is updated to no longer take the length of a code + block as a parameter. - Improved the Neon and SVE2 performance of LDPC encoding for a single code -block (`armral_ldpc_encode_block`). + block (`armral_ldpc_encode_block`). - Improved the Neon performance of Turbo decoding for a single code block -(`armral_turbo_decode_block`). + (`armral_turbo_decode_block`). - Improved the Neon performance of Turbo encoding for a single code block -(`armral_turbo_encode_block`). + (`armral_turbo_encode_block`). - Improved the Neon performance of 32-bit floating point general matrix -inversion (`armral_cmplx_mat_inverse_f32`). + inversion (`armral_cmplx_mat_inverse_f32`). - Improved the Neon performance of 32-bit floating point batch general matrix -inversion (`armral_cmplx_mat_inverse_batch_f32` and -`armral_cmplx_mat_inverse_batch_f32_pa`). + inversion (`armral_cmplx_mat_inverse_batch_f32` and + `armral_cmplx_mat_inverse_batch_f32_pa`). ### Fixed + - The Turbo coding simulation program now builds when performing an SVE build of -the library. + the library. ## [22.07] - 2022-07-15 ### Added + - SVE2-optimized implementation of equalization with four subcarriers -(`armral_solve_*x*_4sc_f32`). + (`armral_solve_*x*_4sc_f32`). - Matrix-vector multiplication functions for batches of 32-bit complex -floating-point matrices and vectors (`armral_cmplx_mat_vec_mult_batch_f32` and -`armral_cmplx_mat_vec_mult_batch_f32_pa`). + floating-point matrices and vectors (`armral_cmplx_mat_vec_mult_batch_f32` and + `armral_cmplx_mat_vec_mult_batch_f32_pa`). - LTE Turbo encoding function (`armral_turbo_encode_block`) that implements the -encoding scheme defined in section 5.1.3.2 of the 3GPP Technical Specification -(TS) 36.212 "Multiplexing and channel coding". + encoding scheme defined in section 5.1.3.2 of the 3GPP Technical Specification + (TS) 36.212 "Multiplexing and channel coding". 
- LTE Turbo decoding function (`armral_turbo_decode_block`) that implements a -maximum a posteriori (MAP) algorithm to return a hard decision (either 0 or 1) -for each output bit. + maximum a posteriori (MAP) algorithm to return a hard decision (either 0 or 1) + for each output bit. - Functions to perform rate matching and rate recovery for Polar coding. These -implement the specification in section 5.4.1 of the 3GPP Technical Specification -(TS) 38.212. + implement the specification in section 5.4.1 of the 3GPP Technical Specification + (TS) 38.212. - Functions to perform rate matching and rate recovery for LDPC coding. This -implements the specification in section 5.4.2 of the 3GPP Technical -Specification (TS) 38.212. + implements the specification in section 5.4.2 of the 3GPP Technical + Specification (TS) 38.212. - Utilities to simulate the error correction performance for Polar, LDPC and -Turbo coding over a noisy channel. + Turbo coding over a noisy channel. ### Changed + - Renamed the Polar encoding and decoding functions to -`armral_polar_encode_block` and `armral_polar_decode_block`. + `armral_polar_encode_block` and `armral_polar_decode_block`. - Improved the Neon and SVE2 performance of 16-QAM modulation -(`armral_modulation` with `armral_modulation_type` set to `ARMRAL_MOD_16QAM)`. + (`armral_modulation` with `armral_modulation_type` set to `ARMRAL_MOD_16QAM)`. - Improved the SVE2 performance of Mu law compression and decompression -(`armral_mu_law_compr_*` and `armral_mu_law_decompr_*`). + (`armral_mu_law_compr_*` and `armral_mu_law_decompr_*`). - Improved the SVE2 performance of block float compression and decompression -(`armral_block_float_compr_*` and `armral_block_float_decompr_*`). + (`armral_block_float_compr_*` and `armral_block_float_decompr_*`). - Improved the SVE2 performance of 8-bit block scaling compression -(`armral_block_scaling_compr_8bit`). + (`armral_block_scaling_compr_8bit`). 
- Improved the performance of 32-bit floating-point and 16-bit fixed-point -complex valued FFTs (`armral_fft_execute_cf32` and `armral_fft_execute_cs16`) -with large prime factors. + complex valued FFTs (`armral_fft_execute_cf32` and `armral_fft_execute_cs16`) + with large prime factors. ## [22.04] - 2022-04-08 ### Added + - SVE2-optimized implementations batched 16-bit fixed-point matrix-vector -multiplication with 64-bit and 32-bit fixed-point accumulator -(`armral_cmplx_mat_vec_mult_batch_i16`, -`armral_cmplx_mat_vec_mult_batch_i16_pa`, -`armral_cmplx_mat_vec_mult_batch_i16_32bit`, -`armral_cmplx_mat_vec_mult_batch_i16_32bit_pa`). + multiplication with 64-bit and 32-bit fixed-point accumulator + (`armral_cmplx_mat_vec_mult_batch_i16`, + `armral_cmplx_mat_vec_mult_batch_i16_pa`, + `armral_cmplx_mat_vec_mult_batch_i16_32bit`, + `armral_cmplx_mat_vec_mult_batch_i16_32bit_pa`). - SVE2-optimized implementation of complex 32-bit floating-point singular value -decomposition (`armral_svd_cf32`). + decomposition (`armral_svd_cf32`). - SVE2-optimized implementations of complex 32-bit floating-point Hermitian -matrix inversion for a single matrix or a batch of matrices of size `3x3` -(`armral_cmplx_hermitian_mat_inverse_f32` and -`armral_cmplx_hermitian_mat_inverse_batch_f32`). + matrix inversion for a single matrix or a batch of matrices of size `3x3` + (`armral_cmplx_hermitian_mat_inverse_f32` and + `armral_cmplx_hermitian_mat_inverse_batch_f32`). - SVE2-optimized implementations of 9-bit and 14-bit Mu law compression -(`armral_mu_law_compr_9bit` and `armral_mu_law_compr_14bit`). + (`armral_mu_law_compr_9bit` and `armral_mu_law_compr_14bit`). - SVE2-optimized implementations of 9-bit and 14-bit Mu law decompression -(`armral_mu_law_decompr_9bit` and `armral_mu_law_decompr_14bit`). + (`armral_mu_law_decompr_9bit` and `armral_mu_law_decompr_14bit`). 
- Complex 32-bit floating-point general matrix inversion for matrices of size -`2x2`, `3x3`, `4x4`, `8x8`, and `16x16` (`armral_cmplx_mat_inverse_f32`). + `2x2`, `3x3`, `4x4`, `8x8`, and `16x16` (`armral_cmplx_mat_inverse_f32`). ### Changed + - Improved the performance of batched 16-bit floating-point matrix-vector -multiplication with 64-bit floating-point accumulator -(`armral_cmplx_mat_vec_mult_batch_i16` and -`armral_cmplx_mat_vec_mult_batch_i16_pa`). + multiplication with 64-bit floating-point accumulator + (`armral_cmplx_mat_vec_mult_batch_i16` and + `armral_cmplx_mat_vec_mult_batch_i16_pa`). - Improved the performance of batched 16-bit floating-point matrix-vector -multiplication with 32-bit floating-point accumulator -(`armral_cmplx_mat_vec_mult_batch_i16_32bit` and -`armral_cmplx_mat_vec_mult_batch_i16_32bit_pa`). + multiplication with 32-bit floating-point accumulator + (`armral_cmplx_mat_vec_mult_batch_i16_32bit` and + `armral_cmplx_mat_vec_mult_batch_i16_32bit_pa`). - Improved the performance of 14-bit block float compression -(`armral_block_float_compr_14bit`). + (`armral_block_float_compr_14bit`). - Improved the performance of 14-bit block scaling compression -(`armral_block_scaling_compr_14bit`). + (`armral_block_scaling_compr_14bit`). - Improved the performance of 14-bit Mu law compression -(`armral_mu_law_compr_14bit`). + (`armral_mu_law_compr_14bit`). - Improved the performance of complex 32-bit floating-point singular value -decomposition (`armral_svd_cf32`). The input matrix now needs to be stored in -column-major order. Output matrices are also returned in column-major order. + decomposition (`armral_svd_cf32`). The input matrix now needs to be stored in + column-major order. Output matrices are also returned in column-major order. 
- Improved the performance of complex 32-bit floating-point Hermitian matrix -inversion for a single matrix or a batch of matrices of size `3x3` -(`armral_cmplx_hermitian_mat_inverse_f32` and -`armral_cmplx_hermitian_mat_inverse_batch_f32`). + inversion for a single matrix or a batch of matrices of size `3x3` + (`armral_cmplx_hermitian_mat_inverse_f32` and + `armral_cmplx_hermitian_mat_inverse_batch_f32`). - Improved the performance of Polar list decoding (`armral_polar_decoder`) with -list size 4. The performance for list size 1 is slightly reduced, but the -list size 4 gives much better error correction. + list size 4. The performance for list size 1 is slightly reduced, but the list + size 4 gives much better error correction. - Added restrictions to the number of matrices and vectors in the batch for the -functions that perform batched matrix-vector multiplications in fixed-point -precision (`armral_cmplx_mat_vec_mult_batch_i16`, -`armral_cmplx_mat_vec_mult_batch_i16_pa`, -`armral_cmplx_mat_vec_mult_batch_i16_32bit`, -`armral_cmplx_mat_vec_mult_batch_i16_32bit_pa`). + functions that perform batched matrix-vector multiplications in fixed-point + precision (`armral_cmplx_mat_vec_mult_batch_i16`, + `armral_cmplx_mat_vec_mult_batch_i16_pa`, + `armral_cmplx_mat_vec_mult_batch_i16_32bit`, + `armral_cmplx_mat_vec_mult_batch_i16_32bit_pa`). - The function to perform fixed-point complex matrix-matrix multiplication with -a 64-bit accumulator (`armral_cmplx_mat_mult_i16`) now narrows from the 64-bit -accumulator to a 32-bit intermediate value, and then to the 16-bit result using -truncating narrowing operations instead of rounding operations. This matches the -behavior in the fixed-point complex matrix-matrix multiplication with a 32-bit -accumulator. 
+ a 64-bit accumulator (`armral_cmplx_mat_mult_i16`) now narrows from the 64-bit + accumulator to a 32-bit intermediate value, and then to the 16-bit result + using truncating narrowing operations instead of rounding operations. This + matches the behavior in the fixed-point complex matrix-matrix multiplication + with a 32-bit accumulator. - The function to perform fixed-point complex matrix-vector multiplication with -a 64-bit accumulator (`armral_cmplx_mat_vec_mult_i16`) now narrows from the -64-bit accumulator to a 32-bit intermediate value, and then to the 16-bit result -using truncating narrowing operations instead of rounding operations. This -matches the behavior in the fixed-point complex matrix-vector multiplication -with a 32-bit accumulator. + a 64-bit accumulator (`armral_cmplx_mat_vec_mult_i16`) now narrows from the + 64-bit accumulator to a 32-bit intermediate value, and then to the 16-bit + result using truncating narrowing operations instead of rounding + operations. This matches the behavior in the fixed-point complex matrix-vector + multiplication with a 32-bit accumulator. diff --git a/CMakeLists.txt b/CMakeLists.txt index 10df7fabbeb45155a091c6c3668f687e249d62d0..1124e6031e4c8fde73fa72202a4a62168ecdef15 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,5 @@ cmake_minimum_required(VERSION 3.3) -project(armral VERSION 24.07) +project(armral VERSION 24.10) if(CMAKE_VERSION VERSION_GREATER 3.4) # Stop CMake from automatically adding -rdynamic to linker flags because it @@ -7,7 +7,7 @@ if(CMAKE_VERSION VERSION_GREATER 3.4) cmake_policy(SET CMP0065 NEW) endif() -# set default build type if none was specified with -DCMAKE_BUILD_TYPE=... +# Set default build type if none was specified with -DCMAKE_BUILD_TYPE=... 
if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) message(STATUS "Setting build type to RELEASE as none was specified.") set(CMAKE_BUILD_TYPE @@ -65,6 +65,7 @@ set(ARMRAL_LIB_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_compression.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_decompression.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/Correlation/arm_correlation.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/bluestein.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gs.c ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gu.c @@ -330,6 +331,8 @@ if(NOT ARMRAL_OVERRIDE_COMPILE_FLAGS) -g3 -ggdb -fno-omit-frame-pointer>) + # Disable GLIBCXX assertions to avoid introducing dependency on libstdc++ + add_definitions(-D_GLIBCXX_NO_ASSERTIONS) message(STATUS "Using compilation flags: ${ARMRAL_COMPILER_FLAGS}") else() # If the CMAKE_C_FLAGS is set, CMake already deals with putting this on the @@ -356,7 +359,7 @@ target_compile_options(armral_utils PRIVATE ${ARMRAL_ARCH_COMPILE_OPTIONS} target_link_libraries(armral_utils PRIVATE ${ARMRAL_LINKER_FLAGS}) if(ARMRAL_SEMIHOSTING) - # when semihosting we need to pass "-DARMRAL_SEMIHOSTING" as a compiler flag, + # When semihosting we need to pass "-DARMRAL_SEMIHOSTING" as a compiler flag, # so we specify the string "ARMRAL_SEMIHOSTING" rather than the CMake variable # ARMRAL_SEMIHOSTING target_compile_definitions(armral PUBLIC "ARMRAL_SEMIHOSTING") @@ -411,7 +414,7 @@ if(BUILD_TESTING) bench_excel_summary COMMAND ${BENCHMARKER_SOURCE_DIR}/bench/benchmarker.py ${CMAKE_SOURCE_DIR} - ${BENCHMARKER_BUILD_DIR} --runner ${BENCHMARKER_RUNNER} --concurrent | tee + ${BENCHMARKER_BUILD_DIR} --runner ${BENCHMARKER_RUNNER} | tee ${BENCHMARKER_BUILD_DIR}/out.json COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/python/benchmark_excel_summary.py 
${BENCHMARKER_BUILD_DIR}/out.json ${JOB_POOL_CONSOLE} @@ -423,9 +426,9 @@ if(BUILD_TESTING) set(ARMRAL_LINKER_FLAGS ${ARMRAL_LINKER_FLAGS} -static) endif() - # utility function to add a test + # Utility function to add a test function(add_armral_test TEST_NAME TEST_SOURCE) - # build the actual test executable itself + # Build the actual test executable itself add_executable(${TEST_NAME} ${TEST_SOURCE}) target_link_libraries(${TEST_NAME} ${ARMRAL_TEST_LINK_LIBRARIES} ${ARMRAL_LINKER_FLAGS}) @@ -433,28 +436,35 @@ if(BUILD_TESTING) target_compile_options(${TEST_NAME} PRIVATE ${ARMRAL_COMPILER_FLAGS} ${ARMRAL_ARCH_COMPILE_OPTIONS}) - # register it as a test, set up dependencies + # Register it as a test, set up dependencies add_test(NAME ${TEST_NAME} COMMAND ${ARMRAL_TEST_RUNNER} ${CMAKE_CURRENT_BINARY_DIR}/${TEST_NAME}) + if(ARMRAL_ENABLE_ASAN) + # Avoid slow-downs in newer versions of Address Sanitizer + # https://github.com/llvm/llvm-project/issues/64190 + set_tests_properties( + ${TEST_NAME} PROPERTIES ENVIRONMENT + "ASAN_OPTIONS=detect_stack_use_after_return=0") + endif() add_dependencies(check ${TEST_NAME}) endfunction() - # utility function to add a benchmark + # Utility function to add a benchmark function(add_armral_bench BENCH_NAME BENCH_SOURCE) - # build the actual bench executable itself + # Build the actual bench executable itself add_executable(bench_${BENCH_NAME} ${BENCH_SOURCE}) target_link_libraries(bench_${BENCH_NAME} ${ARMRAL_TEST_LINK_LIBRARIES} ${ARMRAL_LINKER_FLAGS}) target_include_directories(bench_${BENCH_NAME} PRIVATE ${ARMRAL_TEST_INC}) target_compile_options(bench_${BENCH_NAME} PRIVATE ${ARMRAL_COMPILER_FLAGS}) - # register it as a benchmark, set up dependencies + # Register it as a benchmark, set up dependencies add_dependencies(bench bench_${BENCH_NAME}) add_dependencies(bench_concurrent bench_${BENCH_NAME}) add_dependencies(bench_excel_summary bench_${BENCH_NAME}) - # add target for running the benchmark + # Add target for running the 
benchmark get_filename_component(BENCH_DIR ${BENCH_SOURCE} DIRECTORY) add_custom_target( run_bench_${BENCH_NAME} @@ -553,6 +563,7 @@ if(BUILD_TESTING) test/UpperPHY/Polar/SubchannelInterleave/main.cpp) add_armral_test(turbo_decoding test/UpperPHY/Turbo/Decoding/main.cpp) add_armral_test(turbo_encoding test/UpperPHY/Turbo/Encoding/main.cpp) + add_armral_test(turbo_perm_indices test/UpperPHY/Turbo/PermIndices/main.cpp) add_armral_test(turbo_rate_matching test/UpperPHY/Turbo/RateMatching/main.cpp) add_armral_test(turbo_rate_recovery test/UpperPHY/Turbo/RateRecovery/main.cpp) add_armral_test(svd test/MatrixFactorizations/SVD/main.cpp) @@ -832,17 +843,12 @@ if(NOT CMAKE_C_COMPILER_ID STREQUAL CMAKE_CXX_COMPILER_ID) endif() set(COMP_ERR_MSG - "Compilation is only supported with GNU versions 7, 8, 9, 10, \ - 11, 12, 13, or Clang versions greater than or equal to 12.0.1. \ + "Compilation is only supported with GNU versions 7, 8, 9, 10, 11, 12, 13, 14. \ If compilation fails please use one of the supported compilers." 
) if(CMAKE_C_COMPILER_ID STREQUAL "GNU") if(CMAKE_C_COMPILER_VERSION VERSION_LESS 7.1 OR CMAKE_C_COMPILER_VERSION - VERSION_GREATER 13.2) - message(WARNING ${COMP_ERR_MSG}) - endif() -elseif(CMAKE_C_COMPILER_ID STREQUAL "Clang") - if(CMAKE_C_COMPILER_VERSION VERSION_LESS 12.0.1) + VERSION_GREATER 14.2) message(WARNING ${COMP_ERR_MSG}) endif() else() diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index d4b42d1c18cb629f603fa1dad80f6647bb4ed949..c5c9cbc1e7e3295d54fd24aa4ed50ab1d88ae24b 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,4 +1,4 @@ -# Contributing to Arm RAN Acceleration Library (ArmRAL) +# Contributing to Arm RAN Acceleration Library Describes the requirements for contributing code to Arm RAN Acceleration Library (ArmRAL): @@ -12,10 +12,9 @@ Acceleration Library (ArmRAL): ## Licensing information -Use of Arm RAN Acceleration Library is subject to a BSD-3-Clause -license, the text of which can be found in the `LICENSE.md` file -in your product installation. We will receive inbound contributions -under the same license. +Use of ArmRAL is subject to a BSD-3-Clause license, the text of which can be +found in the `LICENSE.md` file in your product installation. We will receive +inbound contributions under the same license. ## Writing and submitting patches @@ -64,13 +63,13 @@ where: For Fast Fourier Transform (FFT) functions use: - - `cf32`: complex 32-bit floating point; - - `cs16`: complex signed 16-bit integer. + - `cf32`: complex 32-bit floating point; + - `cs16`: complex signed 16-bit integer. For all other functions use: - - `f32`: 32-bit floating point; - - `i16`: signed 16-bit integer. + - `f32`: 32-bit floating point; + - `i16`: signed 16-bit integer. - *variant* is an optional suffix to distinguish different implementations of the same *algorithm* at the same *precision*. 
diff --git a/Doxyfile.in b/Doxyfile.in index 06c4468ed843abcadb63d39e944a7ce124a129ef..0f95a40e8d54db53b3e80d33654497e75cb07e72 100644 --- a/Doxyfile.in +++ b/Doxyfile.in @@ -38,7 +38,7 @@ PROJECT_NAME = "Arm RAN Acceleration Library Reference Guide" # could be handy for archiving the generated documentation or if some version # control system is used. -PROJECT_NUMBER = "24.07" +PROJECT_NUMBER = "24.10" # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a diff --git a/README.md b/README.md index 80cf1dc7f91dd2a0b037c1569bd4cf8bb08b9c87..6c334d56f04970a084ed291ca77f3957ed106aa4 100644 --- a/README.md +++ b/README.md @@ -1,15 +1,14 @@ -# Get started with Arm RAN Acceleration Library (ArmRAL) +# Get started with Arm RAN Acceleration Library This document describes how to build, install, run tests and benchmarks, and uninstall Arm RAN Acceleration Library (ArmRAL). -## Introducing Arm RAN Acceleration Library +## Introducing ArmRAL -Arm RAN Acceleration Library provides optimized signal processing and related -maths functions for enabling 5G Radio Access Network (RAN) deployments. It -leverages the efficient vector units available on Arm cores that support the -Armv8-a architecture to accelerate 5G NR and LTE signal processing workloads, -including: +ArmRAL provides optimized signal processing and related maths functions for +enabling 5G Radio Access Network (RAN) deployments. It leverages the efficient +vector units available on Arm cores that support the Armv8-a architecture to +accelerate 5G NR and LTE signal processing workloads, including: * Matrix and vector arithmetic, such as matrix multiplication. * Fast Fourier Transforms (FFTs). @@ -19,6 +18,15 @@ including: Check (LDPC), and Turbo. * Compression and decompression. +You can download ArmRAL from <https://gitlab.arm.com/networking/ral>. + +ArmRAL is built as a static library and must be linked in to any executable that +needs to use the library. 
Users can build and modify the source code to +integrate with their components or clients. The `include` directory contains the +header files, the `src` directory contains the source code, the `test` directory +contains the testing code, the `bench` directory contains the benchmarking code, +and the `examples` directory contains the examples. + ## Before you begin * Ensure you have installed all the tools listed in the **Tools** section of the @@ -28,7 +36,7 @@ including: the PMULL extension, pmull is listed under the **Features** list given in the `/proc/cpuinfo` file. -## Build Arm RAN Acceleration Library (ArmRAL) +## Build ArmRAL 1. Configure your environment. If you have multiple compilers installed on your machine, you can set the `CC` and `CXX` environment variables to the path to @@ -46,8 +54,8 @@ including: **Note:** If you are building the SVE or SVE2 version of the library, you must compile with GCC 11.1.0 or newer. -2. Build Arm RAN Acceleration Library. Navigate to the unpacked product - directory and use the following commands: +2. Build ArmRAL. Navigate to the unpacked product directory and use the + following commands: mkdir cd @@ -60,8 +68,7 @@ including: specified directory. * `{options}` with the CMake options to use to build the library. * (Optional) `` with an installation directory name. When you - install Arm RAN Acceleration Library (see - **Install Arm RAN Acceleration Library**), the library installs to the + install ArmRAL (see **Install ArmRAL**), the library installs to the specified directory. If `` is not specified, the default is `/usr/local`. * `` with the path to the root directory of the library source. @@ -214,11 +221,10 @@ including: * `-DARMRAL_SEMIHOSTING={On|Off}` - Enable (`On`), or disable (`Off`), building Arm RAN Acceleration library - with semihosting support enabled. 
When semihosting support is enabled, - `--specs=rdimon.specs` is passed as an additional flag during - compilation and `-lrdimon` is added to the link line for testing and - benchmarking. + Enable (`On`), or disable (`Off`), building ArmRAL with semihosting + support enabled. When semihosting support is enabled, + `--specs=rdimon.specs` is passed as an additional flag during compilation + and `-lrdimon` is added to the link line for testing and benchmarking. **Note:** If you use `-DARMRAL_SEMIHOSTING=On` you must also use a compiler with the `aarch64-none-elf` target triple. @@ -235,9 +241,9 @@ including: Default is `On`. -## Install Arm RAN Acceleration Library (ArmRAL) +## Install ArmRAL -After you have built Arm RAN Acceleration Library, you can install the library. +After you have built ArmRAL, you can install the library. 1. Ensure you have write access for the installation directories: @@ -258,11 +264,10 @@ After you have built Arm RAN Acceleration Library, you can install the library. ## Run the tests -The Arm RAN Acceleration Library package includes tests for the available -functions in the library. +The ArmRAL package includes tests for the available functions in the library. -**Note:** To run the library tests, you must have built Arm RAN Acceleration -Library with the `-DBUILD_TESTING=On` CMake option. +**Note:** To run the library tests, you must have built ArmRAL with the + `-DBUILD_TESTING=On` CMake option. To build and run the tests, use: @@ -280,13 +285,12 @@ prefix the tests with `qemu-aarch64` using: ## Run the benchmarks -All the functions in Arm RAN Acceleration Library contain benchmarking code -that contains preset problem sizes. +All the functions in ArmRAL contain benchmarking code that contains preset +problem sizes. -**Note:** To run the benchmark tests, you must have built Arm RAN Acceleration -Library with the `-DBUILD_TESTING=On` CMake option. You must also have the -executable `perf` available on your system. 
This can be installed via your -package manager. +**Note:** To run the benchmark tests, you must have built ArmRAL with the +`-DBUILD_TESTING=On` CMake option. You must also have the executable `perf` +available on your system. This can be installed via your package manager. To build and run the benchmarks, use: @@ -311,8 +315,8 @@ where `` is the path to the root directory of the library source. The source for the example programs is available in the `examples` directory, found in the ArmRAL root directory. -**Note:** To compile and execute the example programs, you must have built Arm -RAN Acceleration Library with the `-DBUILD_EXAMPLES=On` CMake option. +**Note:** To compile and execute the example programs, you must have built + ArmRAL with the `-DBUILD_EXAMPLES=On` CMake option. * To both build and run the example programs, use: @@ -326,27 +330,26 @@ RAN Acceleration Library with the `-DBUILD_EXAMPLES=On` CMake option. The built binaries can be found in the `examples` subdirectory of the build directory. -More information about the examples that are available in Arm RAN Acceleration -Library, and how to use the library in general, is available in -**Use Arm RAN Acceleration Library (ArmRAL)**, see `docs/examples.md`. +More information about the examples that are available in ArmRAL, and how to use +the library in general, is available in **Use Arm RAN Acceleration Library**, +see `docs/examples.md`. ## Run the simulations -You can evaluate the quality of the error correction of the different encoding schemes -against the signal-to-noise ratio using a set of noisy channel simulation -programs. ArmRAL currently only supports zero-mean Additive White Gaussian Noise -(AWGN) channel simulation. +You can evaluate the quality of the error correction of the different encoding +schemes against the signal-to-noise ratio using a set of noisy channel +simulation programs. ArmRAL currently only supports zero-mean Additive White +Gaussian Noise (AWGN) channel simulation. 
-**Note:** The simulation programs do not simulate a full codec, and are -intended to be used to evaluate just the forward error correction properties of -the encoding and decoding of a single code block. We do not consider channel +**Note:** The simulation programs do not simulate a full codec, and are intended +to be used to evaluate just the forward error correction properties of the +encoding and decoding of a single code block. We do not consider channel properties. The source code for the simulations and documentation for their use -are available in the `simulation` directory, found in the ArmRAL root -directory. +are available in the `simulation` directory, found in the ArmRAL root directory. **Note:** To compile and execute the simulation programs, you must have built -Arm RAN Acceleration Library with the `-DBUILD_SIMULATION=On` CMake option. This -option is set to `On` by default. +ArmRAL with the `-DBUILD_SIMULATION=On` CMake option. This option is set to `On` +by default. The following assumes that you are running commands from the build directory. @@ -357,15 +360,14 @@ The following assumes that you are running commands from the build directory. The built binaries can be found in the `simulation` subdirectory of the build directory. -More information about the simulation programs that are available in Arm RAN -Acceleration Library is available in `simulation/README.md`. +More information about the simulation programs that are available in ArmRAL is +available in `simulation/README.md`. ## Code coverage You can generate information that describes how much of the library is used by your application, or is covered by the included tests. To collect code coverage -information, you must have built Arm RAN Acceleration Library with -`-DARMRAL_ENABLE_COVERAGE=On`. +information, you must have built ArmRAL with `-DARMRAL_ENABLE_COVERAGE=On`. An example workflow could be: @@ -394,10 +396,10 @@ file. 
The Arm RAN Acceleration Library Reference Guide is available online at: - https://developer.arm.com/documentation/102249/2407 + https://developer.arm.com/documentation/102249/2410 If you have Doxygen installed on your system, you can build a local HTML version -of the Arm RAN Acceleration Library documentation using CMake. +of the ArmRAL documentation using CMake. To build the documentation, run: @@ -406,9 +408,9 @@ To build the documentation, run: The HTML builds and is output to `docs/html/`. To view the documentation, open the `index.html` file in a browser. -## Uninstall Arm RAN Acceleration Library +## Uninstall ArmRAL -To uninstall Arm RAN Acceleration Library: +To uninstall ArmRAL: 1. Navigate to the library build directory (where you previously ran `make install`) 2. Run: diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 3cfec742185027fedc975b3efb84b478a295f91c..b694be5f42fff8fbed6edd385488f434b656b265 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -1,4 +1,4 @@ -# Arm RAN Acceleration Library 24.07 Release Notes +# Arm RAN Acceleration Library 24.10 Release Notes Non-Confidential Copyright © 2020-2024 Arm Limited (or its affiliates). All rights reserved. @@ -9,6 +9,8 @@ this document. ## Contents +These Release Notes contain the following sections: + - Release overview - Release contents - Support @@ -41,13 +43,7 @@ ArmRAL includes functions that operate on 16-bit signed integers and 16-bit and ### Release status -This is the 24.07 release of ArmRAL. - -These deliverables are being released under the terms of the agreement between -Arm and each licensee (the "Agreement"). All planned verification and -validation is complete. - -The release is suitable for volume production under the terms of the Agreement. +This is the 24.10 release of ArmRAL. ### Licensing information @@ -55,8 +51,9 @@ Use of ArmRAL is subject to a BSD-3-Clause license, the text of which can be found in the `LICENSE.md` file in your product installation. 
We will receive inbound contributions under the same license. -If you require a different license than BSD-3-Clause for compatibility with -your end product, please get in contact. +If you require a different license than BSD-3-Clause for compatibility with your +end product, please get in contact via including +"[ArmRAL]" in the email subject line. ## Release contents @@ -66,34 +63,32 @@ The following subsections describe: - Cloning the product's git repository from Arm's GitLab - The contents of this release -- Any changes since the previous release +- The changes since the previous release - Any known issues and limitations that exist at the time of this release ### Cloning the source repository +ArmRAL is available on +[Arm's GitLab website](https://gitlab.arm.com/networking/ral). + **To access this release, clone the following repository using HTTPS:** - git clone -b armral-24.07 https://git.gitlab.arm.com/networking/ral + git clone -b armral-24.10 https://git.gitlab.arm.com/networking/ral ### Deliverables The downloaded product includes the following deliverables: -- ArmRAL 24.07 +- ArmRAL 24.10 - Release Notes (this document) - Documentation - Product documentation is available on the - [Arm Developer website](https://developer.arm.com/documentation/102249/2407). - - **Note:** Documentation, errata and release notes might change between product - releases. For the latest documentation bundle, check the product download - page. +Product documentation is available on the +[Arm Developer website](https://developer.arm.com/documentation/102249/2410). - **Note:** Arm tests its PDFs only in Adobe Acrobat and Acrobat Reader. Arm - cannot guarantee the quality of this document when used with any other PDF - reader. A suitable PDF reader can be downloaded from - [Adobe](http://www.adobe.com). +**Note:** Documentation, errata and release notes might change between product +releases. For the latest documentation bundle, check the product download +page. 
### Differences from previous release @@ -105,73 +100,23 @@ ArmRAL. This section describes new features or any technical changes to features or components in this release. -- For complex 32-bit floating point matrix multiplication, we recommend that -you use `armral_cmplx_matmul_f32` for all cases. This function calls existing -optimized special cases with minimal overhead and has new optimizations for -larger cases. - -- Renamed `armral_cmplx_mat_mult_aah_f32` to be `armral_cmplx_matmul_aah_f32`. -All arguments are in the same order and have the same meaning. - -- Replaced `armral_cmplx_mat_mult_ahb_f32` with `armral_cmplx_matmul_ahb_f32`. -Note that the meanings of the parameters `m`, `n`, and `k` differ between the -old function and the new; a call to the old function of the form - - `armral_cmplx_mat_mult_ahb_f32(dim1, dim2, dim3, a, b, c);` - -becomes - - `armral_cmplx_matmul_ahb_f32(dim2, dim3, dim1, a, b, c);` - -- Replaced `armral_cmplx_mat_mult_i16` with `armral_cmplx_matmul_i16`. -Note that the meanings of the parameters `m`, `n`, and `k` differ between the -old function and the new; a call to the old function of the form - - `armral_cmplx_mat_mult_i16(dim1, dim2, dim3, a, b, c);` +- Added the function `armral_turbo_perm_idx_init` which generates all + permutation indices used in the permutation step of LTE Turbo decoding. -becomes +- The interfaces for `armral_turbo_decode_block` and + `armral_turbo_decode_block_noalloc` now have an additional argument. They now + include the option to supply a user-allocated buffer which, if used, must be + initialized with permutation indices by calling + `armral_turbo_perm_idx_init`. This buffer can then be reused in subsequent + calls to the Turbo decoding functions and will improve their performance by + removing the need to compute the indices on each call. If the buffer is not + initialized and a null pointer is passed instead, the functions will recompute + the permutation indices on every call. 
- `armral_cmplx_matmul_i16(dim1, dim3, dim2, a, b, c);` - -- Replaced `armral_cmplx_mat_mult_i16_32bit` with `armral_cmplx_matmul_i16_32bit`. -Note that the meanings of the parameters `m`, `n`, and `k` differ between the -old function and the new; a call to the old function of the form - - `armral_cmplx_mat_mult_i16_32bit(dim1, dim2, dim3, a, b, c);` - -becomes - - `armral_cmplx_matmul_i16_32bit(dim1, dim3, dim2, a, b, c);` - -- Replaced `armral_cmplx_mat_mult_f32` with `armral_cmplx_matmul_f32`. -Note that the meanings of the parameters `m`, `n`, and `k` differ between the -old function and the new; a call to the old function of the form - - `armral_cmplx_mat_mult_f32(dim1, dim2, dim3, a, b, c);` - -becomes - - `armral_cmplx_matmul_f32(dim1, dim3, dim2, a, b, c);` - -- The following functions now use unsigned integers in their interfaces to -represent the lengths of vectors and the dimensions of matrices: - - `armral_cmplx_vecdot_f32` - - `armral_cmplx_vecdot_f32_2` - - `armral_cmplx_vecdot_i16` - - `armral_cmplx_vecdot_i16_2` - - `armral_cmplx_vecdot_i16_32bit` - - `armral_cmplx_vecdot_i16_2_32bit` - - `armral_cmplx_vecmul_f32` - - `armral_cmplx_vecmul_f32_2` - - `armral_cmplx_vecmul_i16` - - `armral_cmplx_vecmul_i16_2` - - `armral_corr_coeff_i16` - - `armral_svd_cf32` - - `armral_svd_cf32_noalloc` - - `armral_svd_cf32_noalloc_buffer_size` - -- Added the CMake option `ARMRAL_ENABLE_WEXTRA` to add the compiler flag -`-Wextra` when building the library and tests. +- Added the function `armral_cmplx_matmul_i16_noalloc` which multiplies two + matrices of complex Q15 values using a 64-bit Q32.31 accumulator. This + function does not call any system memory allocators, unlike the existing + `armral_cmplx_matmul_i16` function. #### Performance improvements @@ -179,15 +124,9 @@ This section describes any features or components with improved performance. - Performance improvements for the following routines: - - `armral_cmplx_matmul_f32`. 
For complex 32-bit floating point matrix - multiplication, we recommend that you use this function for all cases. - This function calls existing optimized special cases with minimal overhead - and has new optimizations for larger cases. - - - `armral_turbo_decode_block` and - `armral_turbo_decode_block_noalloc`. These functions now operate - internally on 16-bit integer values rather than 16-bit or 32-bit - floating point values. + - `armral_fft_execute_cf32` and `armral_fft_execute_cs16`. Cases which were + calculated using recursive calls to Rader's algorithm are now calculated + using Bluestein's algorithm. #### Changes to simulation programs @@ -201,12 +140,10 @@ channel simulation programs in this release. This section describes any known issues resolved in the current release. -- Documentation is now installed by the `make install` target, if it has been -built. +- Fixed performance regressions in the SVE versions of the following routines: -- Corrected documentation for `armral_cmplx_mat_inverse_batch_f32` and -`armral_cmplx_mat_inverse_batch_f32_pa` to clarify that these functions have no -restriction on batch sizes. + - `armral_cmplx_vecdot_f32` + - `armral_cmplx_vecmul_f32_2` ### Known limitations @@ -216,25 +153,23 @@ This section describes any known limitations of the current release. ## Support -If you have any issues with the installation, content, or use of this -release, raise a question on the -[Developer Community Forum](). +ArmRAL's release history is available on the [Arm Developer +website](https://developer.arm.com/downloads/-/arm-ran-acceleration-library/previous-releases-of-the-arm-ran-acceleration-library). 
## Conventions diff --git a/bench/BasicMathFun/MatrixMult/Single/MatrixMult16/32b/bench.py b/bench/BasicMathFun/MatrixMult/Single/MatrixMult16/32b/bench.py index 94584d545b7c6fda386a3368de03b701883788b0..243427188696b3cd0658104eba56d4dbd9ef732b 100755 --- a/bench/BasicMathFun/MatrixMult/Single/MatrixMult16/32b/bench.py +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixMult16/32b/bench.py @@ -12,15 +12,18 @@ def get_path(x): return x if Path(x).is_file() else os.path.join("armral", x) exe_name = get_path("bench_matrix_mult_i16_32b") -reps = 300000 -lenArr16 = [2, 4, 8, 16] +full_reps = 300000 +lenArr = [2, 4, 8, 16, 64, 128] j = { "exe_name": exe_name, "cases": [] } -for length in lenArr16: +for length in lenArr: + combined_size = length * 3 + reps = full_reps // combined_size if combined_size > 190 else full_reps + case = { "name": "matmul_i16_32b_{}".format(length), "args": "{}".format(length), diff --git a/bench/BasicMathFun/MatrixMult/Single/MatrixMult16/64b/bench.py b/bench/BasicMathFun/MatrixMult/Single/MatrixMult16/64b/bench.py index 0be0f7a2d96272a12e0062358bd0e7f05a986093..3250872080c705265a7dee04c9477947dc277196 100755 --- a/bench/BasicMathFun/MatrixMult/Single/MatrixMult16/64b/bench.py +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixMult16/64b/bench.py @@ -12,15 +12,18 @@ def get_path(x): return x if Path(x).is_file() else os.path.join("armral", x) exe_name = get_path("bench_matrix_mult_i16_64b") -reps = 300000 -lenArr16 = [2, 4, 8, 16] +full_reps = 300000 +lenArr = [2, 4, 8, 16, 64, 128] j = { "exe_name": exe_name, "cases": [] } -for length in lenArr16: +for length in lenArr: + combined_size = length * 3 + reps = full_reps // combined_size if combined_size > 190 else full_reps + case = { "name": "matmul_i16_64b_{}".format(length), "args": "{}".format(length), diff --git a/bench/UpperPHY/Turbo/Decoding/main.cpp b/bench/UpperPHY/Turbo/Decoding/main.cpp index 4f551457a654cec975b62c4c7482dc5f324fc6d0..bcb262fe807ded9cea40d7876b7068deff7c981c 100644 --- 
a/bench/UpperPHY/Turbo/Decoding/main.cpp +++ b/bench/UpperPHY/Turbo/Decoding/main.cpp @@ -43,13 +43,13 @@ void run_turbo_decoding_perf(const uint32_t num_prbs, const uint32_t num_bits, armral::turbo::decode_block( sys_ptr + j * (num_bits + 4), par_ptr + j * (num_bits + 4), itl_ptr + j * (num_bits + 4), num_bits, ans_ptr + j * num_bytes, 2.F, - num_iters, allocator); + num_iters, nullptr, allocator); #else heap_allocator allocator{}; armral::turbo::decode_block( sys_ptr + j * (num_bits + 4), par_ptr + j * (num_bits + 4), itl_ptr + j * (num_bits + 4), num_bits, ans_ptr + j * num_bytes, 2.F, - num_iters, allocator); + num_iters, nullptr, allocator); #endif } } diff --git a/bench/benchmarker.py b/bench/benchmarker.py index 5a42cd30e2160ced1bdb71f371aaec613680b14d..4761a006571b01b7097612935d3f8c8347ed512e 100755 --- a/bench/benchmarker.py +++ b/bench/benchmarker.py @@ -46,7 +46,7 @@ def decode_if_bytes(in_str): def display_result(res): - if res.returncode == utils.NETFUL_EXPECTED_TIMEOUT_RETCODE: + if res.returncode == utils.ARMRAL_EXPECTED_TIMEOUT_RETCODE: print("Did not run command due to expected timeout: {}".format(res.cmd), file=sys.stderr, flush=True) return 0 elif res.returncode != 0: @@ -67,7 +67,7 @@ def display_result(res): } print(json.dumps(j), flush=True) return (0 if - res.returncode == utils.NETFUL_ALLOW_ERROR_RETCODE + res.returncode == utils.ARMRAL_ALLOW_ERROR_RETCODE else res.returncode) # add begin/duration times in microseconds into existing json structure diff --git a/bench/benchmarker_utils.py b/bench/benchmarker_utils.py index 370395d98985da5bdfdc9004c075436d35c374a3..797890e0c179853b063c4d032c3aacaa9d2004db 100755 --- a/bench/benchmarker_utils.py +++ b/bench/benchmarker_utils.py @@ -8,8 +8,8 @@ import subprocess ShellResult = collections.namedtuple("ShellResult", "returncode, stdout, stderr") SECONDS_TO_MICROS = 1000000 -NETFUL_EXPECTED_TIMEOUT_RETCODE = 3 -NETFUL_ALLOW_ERROR_RETCODE = 4 +ARMRAL_EXPECTED_TIMEOUT_RETCODE = 3 
+ARMRAL_ALLOW_ERROR_RETCODE = 4 def shell(cmd, check=True, **kwargs): diff --git a/docs/examples.md b/docs/examples.md index 2ce0c48c7c535c4faa961dc993fac1f6734feadb..d99815d8030fe4bac20224b3dddd6c7ce4acd8f3 100644 --- a/docs/examples.md +++ b/docs/examples.md @@ -1,4 +1,4 @@ -# Use Arm RAN Acceleration Library (ArmRAL) +# Use Arm RAN Acceleration Library This topic describes how to compile and link your application code to Arm RAN Acceleration Library (ArmRAL). @@ -12,25 +12,25 @@ Acceleration Library (ArmRAL). installed on your machine, you can set the `CC` and `CXX` environment variables to the path to the C compiler and C++ compiler that you want to use. -* You must build Arm RAN Acceleration Library before you can use it in your - application development, or to run the example programs. +* You must build ArmRAL before you can use it in your application development, + or to run the example programs. To build the library, use: - git clone -b armral-24.07 https://git.gitlab.arm.com/networking/ral.git + git clone -b armral-24.10 https://git.gitlab.arm.com/networking/ral.git mkdir ral/build cd ral/build cmake .. make -j -* To use the Arm RAN Acceleration Library functions in your application - development, include the `armral.h` header file in your C or C++ source code. +* To use the ArmRAL functions in your application development, include the + `armral.h` header file in your C or C++ source code. #include "armral.h" ## Procedure -1. Build and link your program with Arm RAN Acceleration Library. For GCC, use: +1. Build and link your program with ArmRAL. For GCC, use: gcc -c -o .o .c -I /include -O2 gcc -o .o /libarmral.a -lm @@ -39,11 +39,10 @@ Acceleration Library (ArmRAL). 
* `` with the name of your own source code file - * `` with the path to your copy of the Arm RAN - Acceleration Library source code + * `` with the path to your copy of the ArmRAL source + code - * `` with the path to your build of Arm RAN - Acceleration Library, as appropriate + * `` with the path to your build of ArmRAL 2. Run your binary: @@ -51,8 +50,8 @@ Acceleration Library (ArmRAL). ## Example: Run 'fft_cf32_example.c' -In this example, we use Arm RAN Acceleration Library to compute and -solve a simple Fast Fourier Transform (FFT) problem. +In this example, we use ArmRAL to compute and solve a simple Fast Fourier +Transform (FFT) problem. The following source file can be found in the ArmRAL source directory under `examples/fft_cf32_example.c`: @@ -66,15 +65,14 @@ The following source file can be found in the ArmRAL source directory under Substituting: - * `` with the path to your copy of the Arm RAN - Acceleration Library source code + * `` with the path to your copy of the ArmRAL source + code - * `` with the path to your build of Arm RAN - Acceleration Library, as appropriate + * `` with the path to your build of ArmRAL **Note:** For this example, there is a requirement to link against libm - (`-lm`). libm is used in several functions in Arm RAN Acceleration Library, - and so might be required for your own programs. + (`-lm`). libm is used in several functions in ArmRAL, and so might be + required for your own programs. An executable called `fft_cf32_example` is built. @@ -105,8 +103,8 @@ The following source file can be found in the ArmRAL source directory under ## Other examples: block-float, modulation, and polar examples -Arm RAN Acceleration Library also includes block-float, modulation, and polar -examples. These example files can also be found in the `/examples/` directory. +ArmRAL also includes block-float, modulation, and polar examples. These example +files can also be found in the `/examples/` directory. 
In addition to the `fft_cf32_example.c` FFT example, the following examples are included: diff --git a/docs/frontmatter.md b/docs/frontmatter.md index db88d5d92f11da34df98c6edd33e90f63eb6a9d0..98a6ac20bcd4f4303da28cc03134132a57b08da6 100644 --- a/docs/frontmatter.md +++ b/docs/frontmatter.md @@ -7,31 +7,6 @@ Copyright © 2020-2024 Arm Limited (or its affiliates). All rights reserved. This book contains reference documentation for Arm RAN Acceleration Library (ArmRAL). The book was generated from the source code using Doxygen. -Arm RAN Acceleration Library provides optimized signal processing and related -maths functions for enabling 5G Radio Access Network (RAN) deployments. It -leverages the efficient vector units available on Arm cores that support the -Armv8-a architecture to accelerate 5G NR and LTE signal processing workloads, -including: - -* Matrix and vector arithmetic, such as matrix multiplication. -* Fast Fourier Transforms (FFTs). -* Digital modulation and demodulation. -* Cyclic Redundancy Check (CRC). -* Encoding and decoding schemes, including Polar, Low-Density Parity - Check (LDPC), and Turbo. -* Compression and decompression. - -You can download Arm RAN Acceleration Library for free from -. - -Arm RAN Acceleration Library is built as a static library, and must be linked in -to any executable that needs to use the library. The source code can be -built and modified by users to integrate with their components or clients. -Header files are located in the `include` directory, the source code is located -in the `src` directory, testing code is located in the `test` directory, -benchmarking code is located in the `bench` directory, and examples are -located in the `examples` directory. - ## Feedback ### Feedback on this product @@ -49,7 +24,7 @@ supplier and give: If you have any comments on content, send an e-mail to errata@arm.com. Give: * The title Arm RAN Acceleration Library Reference Guide. -* The number 102249_2407_00_en. 
+* The number 102249_2410_00_en. * If applicable, the relevant page number(s) to which your comments refer. * A concise explanation of your comments. @@ -161,3 +136,4 @@ Issue | Date | Confidentiality | Change 2401-00 | 19 January 2024 | Non-Confidential | Update for Arm RAN Acceleration Library v24.01 2404-00 | 19 April 2024 | Non-Confidential | Update for Arm RAN Acceleration Library v24.04 2407-00 | 18 July 2024 | Non-Confidential | Update for Arm RAN Acceleration Library v24.07 +2410-00 | 17 October 2024 | Non-Confidential | Update for Arm RAN Acceleration Library v24.10 diff --git a/include/armral.h b/include/armral.h index d47a5b8e8c8ca8bd176ba6b6625ccaeba83d6126..c495fdec8a169ce30d2a7b2fa9da04a7533a8828 100644 --- a/include/armral.h +++ b/include/armral.h @@ -1011,6 +1011,40 @@ armral_status armral_cmplx_matmul_i16(uint16_t m, uint16_t n, uint16_t k, const armral_cmplx_int16_t *p_src_b, armral_cmplx_int16_t *p_dst); +/** + * This function performs the multiplication `C = A B` for matrices, and assumes + * that: + * + Matrix elements are complex int16 in Q15 format. + * + Matrices are stored in memory in row-major order. + * + * A 64-bit Q32.31 accumulator is used internally. If you do not need such a + * large range, consider using \link armral_cmplx_matmul_i16_32bit \endlink + * instead. To get the final result in Q15 and to avoid overflow, the + * accumulator narrows to 16 bits with saturation. + * + * This function takes a pre-allocated buffer (`buffer`) to use internally. + * This variant will not call any system memory allocators. + * + * \note + * - The buffer must be at least `k * n * sizeof(armral_cmplx_int16_t)` bytes. + * + * @param[in] m The number of rows (`M`) in matrices `A` and `C`. + * @param[in] n The number of columns (`N`) in matrices `B` and + * `C`. + * @param[in] k The number of columns (`K`) in matrix `A` and the + * number of rows in matrix `B`. + * @param[in] p_src_a Points to the first input matrix `A`. 
+ * @param[in] p_src_b Points to the second input matrix `B`. + * @param[out] p_dst Points to the output matrix `C`. + * @param[in] buffer Workspace buffer to be used internally. + * @return An `armral_status` value that indicates success or failure. + */ +armral_status +armral_cmplx_matmul_i16_noalloc(uint16_t m, uint16_t n, uint16_t k, + const armral_cmplx_int16_t *p_src_a, + const armral_cmplx_int16_t *p_src_b, + armral_cmplx_int16_t *p_dst, void *buffer); + /** * This function performs the multiplication `C = A B` for matrices, and assumes * that: @@ -3885,6 +3919,28 @@ armral_status armral_ldpc_rate_recovery_noalloc( * (either 0 or 1) for each output bit. The encoding and decoding are performed * for a single code block. */ +/** + * This function generates all permutation indices used in the permutation step + * of the LTE Turbo decoding. This function may be used to generate the indices + * before calling the decoding functions \link armral_turbo_decode_block + * \endlink and \link armral_turbo_decode_block_noalloc \endlink, which accept + * the generated buffer as input. + * + * This function generates permutation indices for all permitted lengths of the + * input data passed to the decoder. After generating the indices, the same + * buffer may be reused independent of any changes to the input data, including + * the length `k`. + * + * This function takes a pre-allocated buffer (`buffer`) to fill with + * permutation indices. + * + * The buffer must be at least `1065744 * sizeof(uint16_t)` bytes. + * + * @param[out] buffer A pre-allocated buffer to hold the permutation indices. + * @return An `armral_status` value that indicates success or failure. + * + */ +armral_status armral_turbo_perm_idx_init(uint16_t *buffer); /** * This function implements the LTE Turbo encoding scheme described in 3GPP * Technical Specification (TS) 36.212 "Multiplexing and channel coding". 
It @@ -3965,10 +4021,19 @@ armral_status armral_turbo_encode_block_noalloc(const uint8_t *src, uint32_t k, * to store `k` bits. These are hard outputs (that is, either 0 or 1); the * function does not return LLRs. * - * The function takes a parameter `max_iter`, which specifies the - * maximum number of iterations that the decoder will perform. The - * algorithm will terminate in fewer iterations if there is no change - * in the computed LLRs between consecutive iterations. + * The function takes a parameter `max_iter`, which specifies the maximum number + * of iterations that the decoder will perform. The algorithm will terminate in + * fewer iterations if there is no change in the computed LLRs between + * consecutive iterations. + * + * \note + * The function is called in one of two ways: + * - `perm_idxs` is populated by calling `armral_turbo_perm_idx_init` before the + * first call to `armral_turbo_decode_block`. This initialization only happens + * once and the resulting permutation array can be reused in multiple calls to + * `armral_turbo_decode_block`. + * - `perm_idxs` is NULL. In this case `armral_turbo_decode_block` will generate + * the permutation indices during each call. * * @param[in] sys The systematic portion of the input of length `k + 4` * bytes representing 8-bit log-likelihood ratios. @@ -3979,11 +4044,15 @@ armral_status armral_turbo_encode_block_noalloc(const uint8_t *src, uint32_t k, * @param[in] k Length of the output code block in bits. * @param[out] dst Decoded output data of length `k` bits. * @param[in] max_iter Maximum number of decoding iterations to perform. + * @param[in] perm_idxs Either a buffer containing the permutation indices + * generated by an earlier call to + * \link armral_turbo_perm_idx_init \endlink, or NULL. * @return An `armral_status` value that indicates success or failure. 
*/ armral_status armral_turbo_decode_block(const int8_t *sys, const int8_t *par, const int8_t *itl, uint32_t k, - uint8_t *dst, uint32_t max_iter); + uint8_t *dst, uint32_t max_iter, + uint16_t *perm_idxs); /** * Non-allocating variant of \link armral_turbo_decode_block \endlink. @@ -4013,6 +4082,15 @@ armral_status armral_turbo_decode_block(const int8_t *sys, const int8_t *par, * calling \link armral_turbo_decode_block_noalloc_buffer_size \endlink * with identical inputs. * + * \note + * The function is called in one of two ways: + * - `perm_idxs` is populated by calling `armral_turbo_perm_idx_init` before the + * first call to `armral_turbo_decode_block`. This initialization only happens + * once and the resulting permutation array can be reused in multiple calls to + * `armral_turbo_decode_block`. + * - `perm_idxs` is NULL. In this case `armral_turbo_decode_block` will generate + * the permutation indices during each call. + * * @param[in] sys The systematic portion of the input of length `k + 4` * bytes representing 8-bit log-likelihood ratios. * @param[in] par The parity portion of the input of length `k + 4` bytes @@ -4022,14 +4100,15 @@ armral_status armral_turbo_decode_block(const int8_t *sys, const int8_t *par, * @param[in] k Length of the output code block in bits. * @param[out] dst Decoded output data of length `k` bits. * @param[in] max_iter Maximum number of decoding iterations to perform. + * @param[in] perm_idxs Either a buffer containing the permutation indices + * generated by an earlier call to + * \link armral_turbo_perm_idx_init \endlink, or NULL. * @param[in] buffer Workspace buffer to be used internally. * @return An `armral_status` value that indicates success or failure. 
 */ -armral_status armral_turbo_decode_block_noalloc(const int8_t *sys, -                                                const int8_t *par, -                                                const int8_t *itl, uint32_t k, -                                                uint8_t *dst, uint32_t max_iter, -                                                void *buffer); +armral_status armral_turbo_decode_block_noalloc( +    const int8_t *sys, const int8_t *par, const int8_t *itl, uint32_t k, +    uint8_t *dst, uint32_t max_iter, uint16_t *perm_idxs, void *buffer); /** * Calculates the required buffer size in bytes required to perform Turbo decoding. diff --git a/simulation/CMakeLists.txt b/simulation/CMakeLists.txt index 89eaf50bb84e77fdfb24a509c501656e7b4486cc..7c610ebfff05f7cd2c5d071fd8f2dfd31264318e 100644 --- a/simulation/CMakeLists.txt +++ b/simulation/CMakeLists.txt @@ -81,6 +81,13 @@ if(Threads_FOUND) add_test(NAME ${SIM_NAME} COMMAND ${CMAKE_CURRENT_BINARY_DIR}/${SIM_NAME} ${SIM_CMD_LINE_OPTS}) set_tests_properties(${SIM_NAME} PROPERTIES TIMEOUT 3000) + if(ARMRAL_ENABLE_ASAN) + # Avoid slow-downs in newer versions of Address Sanitizer + # https://github.com/llvm/llvm-project/issues/64190 + set_tests_properties( + ${SIM_NAME} PROPERTIES ENVIRONMENT + "ASAN_OPTIONS=detect_stack_use_after_return=0") + endif() add_dependencies(check ${SIM_NAME}) endif() diff --git a/simulation/README.md b/simulation/README.md index 8453b2a5f5a706b6950241aba77a465c1e66ff57..9d17dad5b4acfcecaa5de55c60d5696ec75e2fe8 100644 --- a/simulation/README.md +++ b/simulation/README.md @@ -1,14 +1,14 @@ -# Get started with ArmRAL noisy channel simulation +# Get started with Arm RAN Acceleration Library noisy channel simulation ## Introduction This directory contains utilities and programs that you can use to evaluate the error-correction performance of the coding schemes provided in Arm RAN -Acceleration Library (ArmRAL). ArmRAL supports three different coding schemes: -Polar, Turbo, and Low-Density Parity Check (LDPC) codes. In the presence of -noise on a channel, it is expected that some messages may not be decoded -perfectly. 
In the utilities provided we consider that noise on a channel is -zero-mean Additive White Gaussian Noise (AWGN). +Acceleration Library (ArmRAL). ArmRAL supports four different coding schemes: +Polar, Turbo, Low-Density Parity Check (LDPC), and tail biting convolutional +codes. In the presence of noise on a channel, it is expected that some messages +may not be decoded perfectly. In the utilities provided we consider that noise +on a channel is zero-mean Additive White Gaussian Noise (AWGN). The remainder of this document is structured as follows. To start with you will find a mathematical description of the AWGN which is simulated. The definition @@ -63,8 +63,9 @@ simulation program. The simulation programs follow the description of coding and modulation schemes provided in 3GPP Technical Specification (TS) 36.12, Section 5.1.3 (for Turbo -coding) and 3GPP TS 38.212, Section 5.3 (for Low-Density Parity Check (LDPC) -and Polar coding). We make the following further assumptions: +coding) and 3GPP TS 38.212, Section 5.1 (for tail biting convolutional coding) +and Section 5.3 (for Low-Density Parity Check (LDPC) and Polar coding). We make +the following further assumptions: 1. There is no distinction of Uplink/Downlink when it comes to selecting the values for the parameters. 
@@ -266,7 +267,7 @@ The JSON record contains the following fields: "ber": } -### Tail-biting Convolutional Codes +### Tail Biting Convolutional Codes You can run the `convolutional` coding Additive White Gaussian Noise (AWGN) simulation with the following parameters: diff --git a/simulation/turbo_awgn/turbo_awgn.cpp b/simulation/turbo_awgn/turbo_awgn.cpp index 779c0173098b9751915d3daf2d0ddc52577aa666..230427a17b0ac7d288d0ce52a0df7f30e3c2350d 100644 --- a/simulation/turbo_awgn/turbo_awgn.cpp +++ b/simulation/turbo_awgn/turbo_awgn.cpp @@ -129,16 +129,18 @@ struct turbo_example_data { int8_t *par_recovered; // the recovered parity values, stored as LLRs int8_t *itl_recovered; // the recovered interleaved parity values, stored uint8_t *data_decoded; // the decoded data, one bit per input bit - uint8_t *data_decoded_bytes; // the decoded data, one byte per input bit + uint8_t *data_decoded_bytes; // the decoded data, one byte per input bit + uint16_t *permutation_indices; // buffer to hold all permutation indices turbo_example_data(uint32_t k, armral_modulation_type mod, uint32_t e, - uint32_t r) { + uint32_t r, uint16_t *perm_idxs) { mod_type = mod; len_in = k; len_encoded = k + 4; len_matched = e; rv = r; len_out = k; + permutation_indices = perm_idxs; data_in = SNEW(uint8_t, (len_in + 7) / 8); data_in_bytes = SNEW(uint8_t, k); sys_encoded = SNEW(uint8_t, (len_encoded + 7) / 8); @@ -227,9 +229,9 @@ int run_check(armral::utils::random_state *state, double snr_db, uint32_t ulp, data->par_recovered, data->itl_recovered); // Run turbo decoding for a single block. 
- armral_turbo_decode_block(data->sys_recovered, data->par_recovered, - data->itl_recovered, data->len_out, - data->data_decoded, iter_max); + armral_turbo_decode_block( + data->sys_recovered, data->par_recovered, data->itl_recovered, + data->len_out, data->data_decoded, iter_max, data->permutation_indices); // To make it easier to compare the values, convert the bit array to a byte // array @@ -280,7 +282,8 @@ struct sim_result { }; bool run_snr(uint32_t k, uint32_t iter_max, armral_modulation_type mod_type, - uint32_t e, uint32_t rv, uint16_t ulp, double ebn0_db) { + uint32_t e, uint32_t rv, uint16_t ulp, double ebn0_db, + uint16_t *perm_idxs) { // Compute SNR in dB // The coding rate is the ratio of input information bits, k, to the number of // rate-matched bits, e. @@ -303,7 +306,7 @@ bool run_snr(uint32_t k, uint32_t iter_max, armral_modulation_type mod_type, uint64_t nr = 1e4; #pragma omp parallel reduction(+ : nb, num_message_errors) { - turbo_example_data data(k, mod_type, e, rv); + turbo_example_data data(k, mod_type, e, rv, perm_idxs); #pragma omp for for (uint64_t r = 0; r < nr; ++r) { auto state = armral::utils::random_state::from_seeds({r, nr_total}); @@ -414,9 +417,14 @@ int main(int argc, char **argv) { ulp = 128; } - for (double snr = -2; run_snr(k, iter_max, mod_type, e, rv, ulp, snr); - snr += 0.5) { + // Setup permutation indices for Turbo encoding/decoding + uint16_t *perm_idxs = SNEW(uint16_t, 355248 * 3); // sum of all valid ks * 3 + armral_turbo_perm_idx_init(perm_idxs); + + for (double snr = -2; + run_snr(k, iter_max, mod_type, e, rv, ulp, snr, perm_idxs); snr += 0.5) { } + free(perm_idxs); return 0; } diff --git a/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_f32.cpp b/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_f32.cpp index 3c38b69cf8d4f05263cb17eb251ffd4285038b45..996670f4601ed3e941a145456d7d1a440667c87e 100644 --- a/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_f32.cpp +++ b/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_f32.cpp @@ -194,7 
+194,6 @@ cmplx_matmul_f32(const uint16_t m, const uint16_t n, const uint16_t k, float32x4_t b_col_im; float32x4_t b_col_real2; float32x4_t b_col_im2; - float32x2_t accum = vdup_n_f32(0); const float32_t *p_in1_b = (const float32_t *)p_src_a; const float32_t *p_in1_b2 = (const float32_t *)p_src_b; @@ -269,8 +268,6 @@ cmplx_matmul_f32(const uint16_t m, const uint16_t n, const uint16_t k, // Matrix multiplication while (col_cnt > 0U) { - float32x4_t temp_r = {}; - float32x4_t temp_i = {}; // Load & separate real/imag pSrcA (de-interleave 2) a0_v = vld2q_f32(p_in1); // Load & separate real/imag pSrcA (de-interleave 2) @@ -293,8 +290,8 @@ cmplx_matmul_f32(const uint16_t m, const uint16_t n, const uint16_t k, b_col_im2 = vtrn2q_f32(b2_v, b3_v); // odd elem // First column - temp_r = vzip1q_f32x2(b_col_real, b_col_real2); - temp_i = vzip1q_f32x2(b_col_im, b_col_im2); + float32x4_t temp_r = vzip1q_f32x2(b_col_real, b_col_real2); + float32x4_t temp_i = vzip1q_f32x2(b_col_im, b_col_im2); // Second column temp_r2 = vzip2q_f32x2(b_col_real, b_col_real2); @@ -561,21 +558,14 @@ cmplx_matmul_f32(const uint16_t m, const uint16_t n, const uint16_t k, if constexpr (accumulate) { (*px).re += sum_real1; (*px).im += sum_imag1; - px++; (*px_b).re += sum_real1_b; (*px_b).im += sum_imag1_b; - px_b++; } else { (*px).re = sum_real1; (*px).im = sum_imag1; - px++; (*px_b).re = sum_real1_b; (*px_b).im = sum_imag1_b; - px_b++; } - // Update the pointer pIn2 to point to the starting address of the next - // column - j++; } // Update the pointer pInA to point to the starting address of the next 2 @@ -653,7 +643,8 @@ cmplx_matmul_f32(const uint16_t m, const uint16_t n, const uint16_t k, col_cnt--; } - accum = vpadd_f32(vget_low_f32(acc_r0), vget_high_f32(acc_r0)); + float32x2_t accum = + vpadd_f32(vget_low_f32(acc_r0), vget_high_f32(acc_r0)); sum_real1 += accum[0] + accum[1]; accum = vpadd_f32(vget_low_f32(acc_i0), vget_high_f32(acc_i0)); diff --git 
a/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_i16.cpp b/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_i16.cpp index 26c11d3b0355e7adfe31ab726fb38e45abdbb328..3710ca02a4db28af80d83079f78caf6926f60ac9 100644 --- a/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_i16.cpp +++ b/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_i16.cpp @@ -10,5 +10,16 @@ armral_cmplx_matmul_i16(const uint16_t m, const uint16_t n, const uint16_t k, const armral_cmplx_int16_t *__restrict p_src_a, const armral_cmplx_int16_t *__restrict p_src_b, armral_cmplx_int16_t *p_dst) { - return cmplx_matmul_i16(m, n, k, p_src_a, p_src_b, p_dst); + heap_allocator allocator{}; + return cmplx_matmul_i16(m, n, k, p_src_a, p_src_b, p_dst, allocator); +} + +armral_status +armral_cmplx_matmul_i16_noalloc(const uint16_t m, const uint16_t n, + const uint16_t k, + const armral_cmplx_int16_t *__restrict p_src_a, + const armral_cmplx_int16_t *__restrict p_src_b, + armral_cmplx_int16_t *p_dst, void *buffer) { + buffer_bump_allocator allocator{buffer}; + return cmplx_matmul_i16(m, n, k, p_src_a, p_src_b, p_dst, allocator); } diff --git a/src/BasicMathFun/MatrixMult/cmplx_matmul_i16.hpp b/src/BasicMathFun/MatrixMult/cmplx_matmul_i16.hpp index ebabc55ff48300eb12d7175fa4a7a38ed3b3de08..9b8b08fd0e4d8bf64d5eb290e327e3c2fb22e2c8 100644 --- a/src/BasicMathFun/MatrixMult/cmplx_matmul_i16.hpp +++ b/src/BasicMathFun/MatrixMult/cmplx_matmul_i16.hpp @@ -6,9 +6,211 @@ #include "armral.h" #include "intrinsics.h" +#include "utils/allocators.hpp" + +#if ARMRAL_ARCH_SVE >= 2 +#include +#endif namespace { +#if ARMRAL_ARCH_SVE >= 2 +template +inline void compute_blocked_row(svint64_t *acc[], int16_t acc_base_index, + svint16_t axi, svint16_t bi0, svint16_t bi1, + svint16_t bi2, svint16_t bi3) { + *acc[acc_base_index] = svcdot_s64(*acc[acc_base_index], axi, bi0, 0); // re + *acc[acc_base_index + 1] = + svcdot_s64(*acc[acc_base_index + 1], axi, bi0, 90); // im + acc_base_index += 2; + if constexpr (rows_b > 1) { + *acc[acc_base_index] = 
svcdot_s64(*acc[acc_base_index], axi, bi1, 0); // re + *acc[acc_base_index + 1] = + svcdot_s64(*acc[acc_base_index + 1], axi, bi1, 90); // im + acc_base_index += 2; + if constexpr (rows_b > 2) { + *acc[acc_base_index] = + svcdot_s64(*acc[acc_base_index], axi, bi2, 0); // re + *acc[acc_base_index + 1] = + svcdot_s64(*acc[acc_base_index + 1], axi, bi2, 90); // im + acc_base_index += 2; + if constexpr (rows_b > 3) { + *acc[acc_base_index] = + svcdot_s64(*acc[acc_base_index], axi, bi3, 0); // re + *acc[acc_base_index + 1] = + svcdot_s64(*acc[acc_base_index + 1], axi, bi3, 90); // im + acc_base_index += 2; + } + } + } +} + +template +inline void compute_full_row(svbool_t prow, + const armral_cmplx_int16_t *__restrict p_src_a, + const armral_cmplx_int16_t *__restrict p_src_b, + const uint16_t k, svint64_t *acc[]) { + // clang-format off + svint16_t a0i = svdup_s16(0); svint16_t a1i = svdup_s16(0); svint16_t a2i = svdup_s16(0); svint16_t a3i = svdup_s16(0); + svint16_t bi0 = svdup_s16(0); svint16_t bi1 = svdup_s16(0); svint16_t bi2 = svdup_s16(0); svint16_t bi3 = svdup_s16(0); + // clang-format on + + bi0 = svld1_s16(prow, (const int16_t *)&p_src_b[0]); + if constexpr (rows_b > 1) { + bi1 = svld1_s16(prow, (const int16_t *)&p_src_b[k]); + } + if constexpr (rows_b > 2) { + bi2 = svld1_s16(prow, (const int16_t *)&p_src_b[2 * k]); + } + if constexpr (rows_b > 3) { + bi3 = svld1_s16(prow, (const int16_t *)&p_src_b[3 * k]); + } + + // A matrix - based svdot + int16_t acc_index = 0; + // r0 + a0i = svld1_s16(prow, (const int16_t *)&p_src_a[0]); + compute_blocked_row(acc, acc_index, a0i, bi0, bi1, bi2, bi3); + acc_index += 8; + if constexpr (rows_a > 1) { + // r1 + a1i = svld1_s16(prow, (const int16_t *)&p_src_a[k]); + compute_blocked_row(acc, acc_index, a1i, bi0, bi1, bi2, bi3); + acc_index += 8; + } + if constexpr (rows_a > 2) { + // r2 + a2i = svld1_s16(prow, (const int16_t *)&p_src_a[2 * k]); + compute_blocked_row(acc, acc_index, a2i, bi0, bi1, bi2, bi3); + acc_index += 8; + 
} + if constexpr (rows_a > 3) { + // r3 + a3i = svld1_s16(prow, (const int16_t *)&p_src_a[3 * k]); + compute_blocked_row(acc, acc_index, a3i, bi0, bi1, bi2, bi3); + acc_index += 8; + } +} + +template +inline void update_dst_based_on_rows_b(armral_cmplx_int16_t *dst, + svint64_t *acc[], int16_t &dst_index, + int16_t &acc_index) { + dst[dst_index].re = + (int16_t)(sat((svaddv_s64(svptrue_b64(), *acc[acc_index]) >> 15))); + dst[dst_index].im = + (int16_t)(sat((svaddv_s64(svptrue_b64(), *acc[acc_index + 1]) >> 15))); + dst_index++; + acc_index += 2; + + if constexpr (rows_b > 1) { + dst[dst_index].re = + (int16_t)(sat((svaddv_s64(svptrue_b64(), *acc[acc_index]) >> 15))); + dst[dst_index].im = + (int16_t)(sat((svaddv_s64(svptrue_b64(), *acc[acc_index + 1]) >> 15))); + } + dst_index++; + acc_index += 2; + + if constexpr (rows_b > 2) { + dst[dst_index].re = + (int16_t)(sat((svaddv_s64(svptrue_b64(), *acc[acc_index]) >> 15))); + dst[dst_index].im = + (int16_t)(sat((svaddv_s64(svptrue_b64(), *acc[acc_index + 1]) >> 15))); + } + dst_index++; + acc_index += 2; + + if constexpr (rows_b > 3) { + dst[dst_index].re = + (int16_t)(sat((svaddv_s64(svptrue_b64(), *acc[acc_index]) >> 15))); + dst[dst_index].im = + (int16_t)(sat((svaddv_s64(svptrue_b64(), *acc[acc_index + 1]) >> 15))); + } + dst_index++; + acc_index += 2; +} + +template +inline void update_dst(uint16_t n, armral_cmplx_int16_t *dst, + svint64_t *acc[]) { + int16_t acc_index = 0; + int16_t dst_index = 0; + // Update dst based on rows_a + // r0 + update_dst_based_on_rows_b(dst, acc, dst_index, acc_index); + if constexpr (rows_a > 1) { + // r1 + dst_index = n; + update_dst_based_on_rows_b(dst, acc, dst_index, acc_index); + if constexpr (rows_a > 2) { + // r2 + dst_index = 2 * n; + update_dst_based_on_rows_b(dst, acc, dst_index, acc_index); + if constexpr (rows_a > 3) { + // r3 + dst_index = 3 * n; + update_dst_based_on_rows_b(dst, acc, dst_index, acc_index); + } + } + } +} + +template +inline void sve_mat_a_b_dot(const 
uint16_t m, const uint16_t n, + const uint16_t k, + const armral_cmplx_int16_t *__restrict p_src_a, + const armral_cmplx_int16_t *__restrict p_src_b, + armral_cmplx_int16_t *dst) { + // clang-format off + // r0 + svint64_t c00_re = svdup_s64(0); svint64_t c00_im = svdup_s64(0); + svint64_t c01_re = svdup_s64(0); svint64_t c01_im = svdup_s64(0); + svint64_t c02_re = svdup_s64(0); svint64_t c02_im = svdup_s64(0); + svint64_t c03_re = svdup_s64(0); svint64_t c03_im = svdup_s64(0); + + // r1 + svint64_t c10_re = svdup_s64(0); svint64_t c10_im = svdup_s64(0); + svint64_t c11_re = svdup_s64(0); svint64_t c11_im = svdup_s64(0); + svint64_t c12_re = svdup_s64(0); svint64_t c12_im = svdup_s64(0); + svint64_t c13_re = svdup_s64(0); svint64_t c13_im = svdup_s64(0); + + // r2 + svint64_t c20_re = svdup_s64(0); svint64_t c20_im = svdup_s64(0); + svint64_t c21_re = svdup_s64(0); svint64_t c21_im = svdup_s64(0); + svint64_t c22_re = svdup_s64(0); svint64_t c22_im = svdup_s64(0); + svint64_t c23_re = svdup_s64(0); svint64_t c23_im = svdup_s64(0); + + // r3 + svint64_t c30_re = svdup_s64(0); svint64_t c30_im = svdup_s64(0); + svint64_t c31_re = svdup_s64(0); svint64_t c31_im = svdup_s64(0); + svint64_t c32_re = svdup_s64(0); svint64_t c32_im = svdup_s64(0); + svint64_t c33_re = svdup_s64(0); svint64_t c33_im = svdup_s64(0); + // clang-format on + + svint64_t *acc[] = { + &c00_re, &c00_im, &c01_re, &c01_im, &c02_re, &c02_im, &c03_re, &c03_im, + &c10_re, &c10_im, &c11_re, &c11_im, &c12_re, &c12_im, &c13_re, &c13_im, + &c20_re, &c20_im, &c21_re, &c21_im, &c22_re, &c22_im, &c23_re, &c23_im, + &c30_re, &c30_im, &c31_re, &c31_im, &c32_re, &c32_im, &c33_re, &c33_im}; + + uint16_t full_vecs = k / svcntw(); // word - 2*16-bit elements + uint16_t tail = k % svcntw(); + + svbool_t pa = svptrue_b16(); + + uint16_t h = 0; + for (uint16_t done_vecs = 0; done_vecs < full_vecs; + done_vecs++, h += svcntw()) { + compute_full_row(pa, &p_src_a[h], &p_src_b[h], k, acc); + } + if (tail) { + pa = 
svwhilelt_b16(0, 2 * tail); + compute_full_row(pa, &p_src_a[h], &p_src_b[h], k, acc); + } + update_dst(n, dst, acc); +} +#else inline int16x4_t __attribute__((always_inline)) vzip1_u16x2(int16x4_t a, int16x4_t b) { // should be zip1 c.2s, a.2s, b.2s @@ -24,12 +226,86 @@ vzip2_u16x2(int16x4_t a, int16x4_t b) { return vreinterpret_s16_u32( vset_lane_u32(vreinterpret_u32_s16(a)[1], vreinterpret_u32_s16(b), 0)); } +#endif +template inline armral_status cmplx_matmul_i16(const uint16_t m, const uint16_t n, const uint16_t k, const armral_cmplx_int16_t *__restrict p_src_a, const armral_cmplx_int16_t *__restrict p_src_b, - armral_cmplx_int16_t *p_dst) { + armral_cmplx_int16_t *p_dst, Allocator &allocator) { +#if ARMRAL_ARCH_SVE >= 2 + auto p_transp_b = + allocate_uninitialized(allocator, n * k); + uint16_t cnt = 0; + for (uint16_t i = 0; i < n; i++) { + for (uint16_t j = 0; j < k; j++) { + p_transp_b[cnt++] = p_src_b[j * n + i]; + } + } + + uint16_t i = 0; + for (; i < m - 3; i += 4) { + uint16_t j = 0; + for (; j < n - 3; j += 4) { + sve_mat_a_b_dot<4, 4>(m, n, k, &p_src_a[i * k], &p_transp_b[j * k], + &p_dst[i * n + j]); + } + + for (; j < n - 1; j += 2) { + sve_mat_a_b_dot<4, 2>(m, n, k, &p_src_a[i * k], &p_transp_b[j * k], + &p_dst[i * n + j]); + } + + // If n is odd, we have one more row/col to go + if (n % 2) { + j = n - 1; + sve_mat_a_b_dot<4, 1>(m, n, k, &p_src_a[i * k], &p_transp_b[j * k], + &p_dst[i * n + j]); + } + } + + for (; i < m - 1; i += 2) { + uint16_t j = 0; + for (; j < n - 3; j += 4) { + sve_mat_a_b_dot<2, 4>(m, n, k, &p_src_a[i * k], &p_transp_b[j * k], + &p_dst[i * n + j]); + } + + for (; j < n - 1; j += 2) { + sve_mat_a_b_dot<2, 2>(m, n, k, &p_src_a[i * k], &p_transp_b[j * k], + &p_dst[i * n + j]); + } + + // If n is odd, we have one more row/col to go + if (n % 2) { + j = n - 1; + sve_mat_a_b_dot<2, 1>(m, n, k, &p_src_a[i * k], &p_transp_b[j * k], + &p_dst[i * n + j]); + } + } + + if (m % 2) { + i = m - 1; + uint16_t j = 0; + for (; j < n - 3; j += 
4) { + sve_mat_a_b_dot<1, 4>(m, n, k, &p_src_a[i * k], &p_transp_b[j * k], + &p_dst[i * n + j]); + } + + for (; j < n - 1; j += 2) { + sve_mat_a_b_dot<1, 2>(m, n, k, &p_src_a[i * k], &p_transp_b[j * k], + &p_dst[i * n + j]); + } + + // If n is odd, we have one more row/col to go + if (n % 2) { + j = n - 1; + sve_mat_a_b_dot<1, 1>(m, n, k, &p_src_a[i * k], &p_transp_b[j * k], + &p_dst[i * n + j]); + } + } +#else const int16_t *p_in1 = (const int16_t *)p_src_a; const armral_cmplx_int16_t *p_in_a = p_src_a; armral_cmplx_int16_t *p_out = p_dst; @@ -881,7 +1157,7 @@ cmplx_matmul_i16(const uint16_t m, const uint16_t n, const uint16_t k, col--; } } - +#endif return ARMRAL_SUCCESS; } } // anonymous namespace diff --git a/src/BasicMathFun/MatrixMult/cmplx_matmul_i16_32bit.hpp b/src/BasicMathFun/MatrixMult/cmplx_matmul_i16_32bit.hpp index f734fb6eb6e486cb4e29e0966c1fc8ead82fd595..87acc068808e029ae7ff084d483c587934c411de 100644 --- a/src/BasicMathFun/MatrixMult/cmplx_matmul_i16_32bit.hpp +++ b/src/BasicMathFun/MatrixMult/cmplx_matmul_i16_32bit.hpp @@ -5,999 +5,298 @@ #pragma once #include "armral.h" +#if ARMRAL_ARCH_SVE >= 2 +#include +#endif namespace { typedef struct { - int32_t re; - int32_t im; + int32_t re; ///< 32-bit real component. + int32_t im; ///< 32-bit imaginary component. } cmplx_int32_t; -void armral_cmplx_matmul_i16_32bit_2xkx4( - uint16_t k, const armral_cmplx_int16_t *__restrict a, - const armral_cmplx_int16_t *__restrict b, int ldb, - armral_cmplx_int16_t *dst) { - // Performs the multiplication of a row of matrix A by a set of four columns - // of matrix B. It is assumed that B has four columns and no bounds checking - // is done. Equally, it is assumed that the pointer to A is for a row vector - // of length k. 
- - const int16_t *a_int16 = (const int16_t *)a; - const int16_t *b_int16 = (const int16_t *)b; - // Accumulators for the real and imaginary components of the first row - int32x4_t real_acc[4] = {0}; - int32x4_t imag_acc[4] = {0}; - // Accumulators for the real and imaginary components of the second row - int32x4_t real_acc1[4] = {0}; - int32x4_t imag_acc1[4] = {0}; - - // Loop over k in blocks of 8 - for (int blk8 = k >> 3; blk8 > 0; - --blk8, a_int16 += 16, b_int16 += 16 * ldb) { - // Load 8 complex numbers from A into arrays of real and complex components - int16x8x2_t a_row0 = vld2q_s16(a_int16); - int16x8x2_t a_row1 = vld2q_s16(a_int16 + 2 * k); - - // Load 8 rows from B - int16x8_t b_tmp[8] = {0}; - for (int i = 0; i < 8; ++i) { - b_tmp[i] = vld1q_s16(b_int16 + 2 * i * ldb); - } - - // We now want to transpose the 8x4 matrix of complex numbers, and - // de-interleave into real and complex components - int16x8_t real_bs[4] = {0}; - int16x8_t imag_bs[4] = {0}; - // We first separate out the real and imaginary components - for (int i = 0; i < 4; ++i) { - real_bs[i] = vtrn1q_s16(b_tmp[2 * i], b_tmp[2 * i + 1]); - imag_bs[i] = vtrn2q_s16(b_tmp[2 * i], b_tmp[2 * i + 1]); - } - - // Now we interleave pairs of real numbers to start to get them in order. 
- // For example, for the first two real vectors we have - // ^: [r_0, r_1, r_k, r_k+1, r_2k, r_2k+1, r_3k, r_3k+1] = trn1(v0, v1) - // ^: [r_2, r_3, r_k+2, r_k+3, r_2k+2, r_2k+3, r_3k+2, r_3k+3] = trn1(v2, v3) - // ^: zip1(trn1(v0, v1), trn1(v2, v3)) = [r_0, r_1, r_2, r_3, r_k, r_k+1, r_k+2, r_k+3] - for (int i = 0; i < 2; ++i) { - b_tmp[2 * i] = vreinterpretq_s16_u32( - vzip1q_u32(vreinterpretq_u32_s16(real_bs[2 * i]), - vreinterpretq_u32_s16(real_bs[2 * i + 1]))); - b_tmp[2 * i + 1] = vreinterpretq_s16_u32( - vzip2q_u32(vreinterpretq_u32_s16(real_bs[2 * i]), - vreinterpretq_u32_s16(real_bs[2 * i + 1]))); - - b_tmp[4 + 2 * i] = vreinterpretq_s16_u32( - vzip1q_u32(vreinterpretq_u32_s16(imag_bs[2 * i]), - vreinterpretq_u32_s16(imag_bs[2 * i + 1]))); - b_tmp[4 + 2 * i + 1] = vreinterpretq_s16_u32( - vzip2q_u32(vreinterpretq_u32_s16(imag_bs[2 * i]), - vreinterpretq_u32_s16(imag_bs[2 * i + 1]))); - } - - for (int i = 0; i < 2; ++i) { - real_bs[2 * i] = vreinterpretq_s16_u64( - vzip1q_u64(vreinterpretq_u64_s16(b_tmp[i]), - vreinterpretq_u64_s16(b_tmp[i + 2]))); - real_bs[2 * i + 1] = vreinterpretq_s16_u64( - vzip2q_u64(vreinterpretq_u64_s16(b_tmp[i]), - vreinterpretq_u64_s16(b_tmp[i + 2]))); - - imag_bs[2 * i] = vreinterpretq_s16_u64( - vzip1q_u64(vreinterpretq_u64_s16(b_tmp[4 + i]), - vreinterpretq_u64_s16(b_tmp[i + 6]))); - imag_bs[2 * i + 1] = vreinterpretq_s16_u64( - vzip2q_u64(vreinterpretq_u64_s16(b_tmp[4 + i]), - vreinterpretq_u64_s16(b_tmp[i + 6]))); - } - - // Now perform the dot product of the row of A with columns of B, expanding - // to 32 bits Row 1 - for (int i = 0; i < 4; ++i) { - // Real * real - real_acc[i] = vqdmlal_s16(real_acc[i], vget_low_s16(a_row0.val[0]), - vget_low_s16(real_bs[i])); - real_acc[i] = vqdmlal_high_s16(real_acc[i], a_row0.val[0], real_bs[i]); - // Imag * imag - real_acc[i] = vqdmlsl_s16(real_acc[i], vget_low_s16(a_row0.val[1]), - vget_low_s16(imag_bs[i])); - real_acc[i] = vqdmlsl_high_s16(real_acc[i], a_row0.val[1], imag_bs[i]); - 
// Real * imag - imag_acc[i] = vqdmlal_s16(imag_acc[i], vget_low_s16(a_row0.val[0]), - vget_low_s16(imag_bs[i])); - imag_acc[i] = vqdmlal_high_s16(imag_acc[i], a_row0.val[0], imag_bs[i]); - // Imag * real - imag_acc[i] = vqdmlal_s16(imag_acc[i], vget_low_s16(a_row0.val[1]), - vget_low_s16(real_bs[i])); - imag_acc[i] = vqdmlal_high_s16(imag_acc[i], a_row0.val[1], real_bs[i]); - } - - // Row 2 - for (int i = 0; i < 4; ++i) { - // Real * real - real_acc1[i] = vqdmlal_s16(real_acc1[i], vget_low_s16(a_row1.val[0]), - vget_low_s16(real_bs[i])); - real_acc1[i] = vqdmlal_high_s16(real_acc1[i], a_row1.val[0], real_bs[i]); - // Imag * imag - real_acc1[i] = vqdmlsl_s16(real_acc1[i], vget_low_s16(a_row1.val[1]), - vget_low_s16(imag_bs[i])); - real_acc1[i] = vqdmlsl_high_s16(real_acc1[i], a_row1.val[1], imag_bs[i]); - // Real * imag - imag_acc1[i] = vqdmlal_s16(imag_acc1[i], vget_low_s16(a_row1.val[0]), - vget_low_s16(imag_bs[i])); - imag_acc1[i] = vqdmlal_high_s16(imag_acc1[i], a_row1.val[0], imag_bs[i]); - // Imag * real - imag_acc1[i] = vqdmlal_s16(imag_acc1[i], vget_low_s16(a_row1.val[1]), - vget_low_s16(real_bs[i])); - imag_acc1[i] = vqdmlal_high_s16(imag_acc1[i], a_row1.val[1], real_bs[i]); - } - } - - if ((k & 4) != 0) { - // Load 4 complex numbers from A at a time - int16x4x2_t a_tmp = vld2_s16(a_int16); - int16x8_t a_row0 = vcombine_s16(a_tmp.val[0], a_tmp.val[1]); - a_tmp = vld2_s16(a_int16 + 2 * k); - int16x8_t a_row1 = vcombine_s16(a_tmp.val[0], a_tmp.val[1]); - - // Load 4 rows from B - int16x8_t b_vals[4] = {0}; - for (int i = 0; i < 4; ++i) { - b_vals[i] = vld1q_s16(b_int16 + 2 * i * ldb); +#if ARMRAL_ARCH_SVE >= 2 +template +inline void +cmplx_matmul_i16_32bit_ixkxj_sve(svbool_t pb, const uint16_t m, + const uint16_t n, const uint16_t k, + const armral_cmplx_int16_t *__restrict p_src_a, + const armral_cmplx_int16_t *__restrict p_src_b, + armral_cmplx_int16_t *p_dst) { + + // Only valid for these values of rows_a + static_assert(rows_a == 1 || rows_a == 2); + + 
// clang-format off + svint16_t a0i_0_3 = svdup_s16(0); svint16_t a1i_0_3 = svdup_s16(0); + + svint16_t b0i = svdup_s16(0); svint16_t b1i = svdup_s16(0); + svint16_t b2i = svdup_s16(0); svint16_t b3i = svdup_s16(0); + + svint32_t c0_re = svdup_s32(0); svint32_t c0_im = svdup_s32(0); + svint32_t c1_re = svdup_s32(0); svint32_t c1_im = svdup_s32(0); + // clang-format on + + for (uint16_t h = 0; h < k;) { + svbool_t pa = svwhilelt_b16(2 * h, 2 * k); + + a0i_0_3 = svld1_s16(pa, (const int16_t *)&p_src_a[h]); + + if constexpr (rows_a > 1) { + a1i_0_3 = svld1_s16(pa, (const int16_t *)&p_src_a[h + k]); + } + + b0i = svld1_s16(pb, (const int16_t *)&p_src_b[h * n]); + if (h < k - 1) { + b1i = svld1_s16(pb, (const int16_t *)&p_src_b[(h + 1) * n]); + if (h < k - 3) { + b2i = svld1_s16(pb, (const int16_t *)&p_src_b[(h + 2) * n]); + b3i = svld1_s16(pb, (const int16_t *)&p_src_b[(h + 3) * n]); + } + } + c0_re = svqdmlalb_lane_s32(c0_re, b0i, a0i_0_3, 0); + c0_re = svqdmlslt_lane_s32(c0_re, b0i, a0i_0_3, 1); + c0_im = svqdmlalt_lane_s32(c0_im, b0i, a0i_0_3, 0); + c0_im = svqdmlalb_lane_s32(c0_im, b0i, a0i_0_3, 1); + if (h < k - 1) { + c0_re = svqdmlalb_lane_s32(c0_re, b1i, a0i_0_3, 2); + c0_re = svqdmlslt_lane_s32(c0_re, b1i, a0i_0_3, 3); + c0_im = svqdmlalt_lane_s32(c0_im, b1i, a0i_0_3, 2); + c0_im = svqdmlalb_lane_s32(c0_im, b1i, a0i_0_3, 3); + if (h < k - 3) { + c0_re = svqdmlalb_lane_s32(c0_re, b2i, a0i_0_3, 4); + c0_re = svqdmlslt_lane_s32(c0_re, b2i, a0i_0_3, 5); + c0_im = svqdmlalt_lane_s32(c0_im, b2i, a0i_0_3, 4); + c0_im = svqdmlalb_lane_s32(c0_im, b2i, a0i_0_3, 5); + c0_re = svqdmlalb_lane_s32(c0_re, b3i, a0i_0_3, 6); + c0_re = svqdmlslt_lane_s32(c0_re, b3i, a0i_0_3, 7); + c0_im = svqdmlalt_lane_s32(c0_im, b3i, a0i_0_3, 6); + c0_im = svqdmlalb_lane_s32(c0_im, b3i, a0i_0_3, 7); + } + } + if constexpr (rows_a > 1) { + c1_re = svqdmlalb_lane_s32(c1_re, b0i, a1i_0_3, 0); + c1_re = svqdmlslt_lane_s32(c1_re, b0i, a1i_0_3, 1); + c1_im = svqdmlalt_lane_s32(c1_im, b0i, a1i_0_3, 
0); + c1_im = svqdmlalb_lane_s32(c1_im, b0i, a1i_0_3, 1); + if (h < k - 1) { + c1_re = svqdmlalb_lane_s32(c1_re, b1i, a1i_0_3, 2); + c1_re = svqdmlslt_lane_s32(c1_re, b1i, a1i_0_3, 3); + c1_im = svqdmlalt_lane_s32(c1_im, b1i, a1i_0_3, 2); + c1_im = svqdmlalb_lane_s32(c1_im, b1i, a1i_0_3, 3); + if (h < k - 3) { + c1_re = svqdmlalb_lane_s32(c1_re, b2i, a1i_0_3, 4); + c1_re = svqdmlslt_lane_s32(c1_re, b2i, a1i_0_3, 5); + c1_im = svqdmlalt_lane_s32(c1_im, b2i, a1i_0_3, 4); + c1_im = svqdmlalb_lane_s32(c1_im, b2i, a1i_0_3, 5); + c1_re = svqdmlalb_lane_s32(c1_re, b3i, a1i_0_3, 6); + c1_re = svqdmlslt_lane_s32(c1_re, b3i, a1i_0_3, 7); + c1_im = svqdmlalt_lane_s32(c1_im, b3i, a1i_0_3, 6); + c1_im = svqdmlalb_lane_s32(c1_im, b3i, a1i_0_3, 7); + } + } + } + if (h < k - 1) { + if (h < k - 3) { + h += 4; + } else { + h += 2; + } + } else { + h += 1; } - - // Separate out the real and imaginary components and reorder the columns - // of b_tmp as rows. Real components go into the low half of int16x8 - // vectors, and imaginary components to the high half - int16x8_t b_tmp[4] = {0}; - for (int i = 0; i < 2; ++i) { - b_tmp[2 * i] = vzip1q_s16(b_vals[2 * i], b_vals[2 * i + 1]); - b_tmp[2 * i + 1] = vzip2q_s16(b_vals[2 * i], b_vals[2 * i + 1]); - } - - for (int i = 0; i < 2; ++i) { - b_vals[2 * i] = vreinterpretq_s16_u32( - vzip1q_u32(vreinterpretq_u32_s16(b_tmp[i]), - vreinterpretq_u32_s16(b_tmp[i + 2]))); - b_vals[2 * i + 1] = vreinterpretq_s16_u32( - vzip2q_u32(vreinterpretq_u32_s16(b_tmp[i]), - vreinterpretq_u32_s16(b_tmp[i + 2]))); - } - - // Now do the multiplication - // Row 1 - for (int i = 0; i < 4; ++i) { - // Real * real - real_acc[i] = vqdmlal_s16(real_acc[i], vget_low_s16(a_row0), - vget_low_s16(b_vals[i])); - // Imag * imag - real_acc[i] = vqdmlsl_high_s16(real_acc[i], a_row0, b_vals[i]); - // Real * imag - imag_acc[i] = vqdmlal_s16(imag_acc[i], vget_low_s16(a_row0), - vget_high_s16(b_vals[i])); - // Imag * real - imag_acc[i] = vqdmlal_s16(imag_acc[i], 
vget_high_s16(a_row0), - vget_low_s16(b_vals[i])); - } - - // Row 2 - for (int i = 0; i < 4; ++i) { - // Real * real - real_acc1[i] = vqdmlal_s16(real_acc1[i], vget_low_s16(a_row1), - vget_low_s16(b_vals[i])); - // Imag * imag - real_acc1[i] = vqdmlsl_high_s16(real_acc1[i], a_row1, b_vals[i]); - // Real * imag - imag_acc1[i] = vqdmlal_s16(imag_acc1[i], vget_low_s16(a_row1), - vget_high_s16(b_vals[i])); - // Imag * real - imag_acc1[i] = vqdmlal_s16(imag_acc1[i], vget_high_s16(a_row1), - vget_low_s16(b_vals[i])); - } - - a_int16 += 8; - b_int16 += 8 * ldb; - } - - int32x4_t sum_32_0 = vdupq_n_s32(0); - int32x4_t sum_32_1 = vdupq_n_s32(0); - int32x4_t sum_32_2 = vdupq_n_s32(0); - int32x4_t sum_32_3 = vdupq_n_s32(0); - for (int i = 0; i < 4; ++i) { - sum_32_0[i] = vqmovnd_s64(vaddlvq_s32(real_acc[i])); - sum_32_1[i] = vqmovnd_s64(vaddlvq_s32(imag_acc[i])); - sum_32_2[i] = vqmovnd_s64(vaddlvq_s32(real_acc1[i])); - sum_32_3[i] = vqmovnd_s64(vaddlvq_s32(imag_acc1[i])); } - for (int a_col_cnt = k & 3; a_col_cnt > 0; - --a_col_cnt, a_int16 += 2, b_int16 += 2 * ldb) { - armral_cmplx_int16_t a_row0 = *((const armral_cmplx_int16_t *)a_int16); - armral_cmplx_int16_t a_row1 = - *((const armral_cmplx_int16_t *)(a_int16 + 2 * k)); + // saturate and right shift + svint16_t res_r0 = svshrnt(svshrnb(c0_re, 16), c0_im, 16); + // store the result in dst + svst1_s16(pb, (int16_t *)&p_dst[0], res_r0); - // Load four columns from B - int16x8_t b_vals = vld1q_s16(b_int16); - - for (int i = 0; i < 4; ++i) { - // Real * real - sum_32_0[i] = vqdmlalh_s16(sum_32_0[i], a_row0.re, b_vals[2 * i]); - sum_32_2[i] = vqdmlalh_s16(sum_32_2[i], a_row1.re, b_vals[2 * i]); - - // Imag * imag - sum_32_0[i] = vqdmlslh_s16(sum_32_0[i], a_row0.im, b_vals[2 * i + 1]); - sum_32_2[i] = vqdmlslh_s16(sum_32_2[i], a_row1.im, b_vals[2 * i + 1]); - - // Real * imag - sum_32_1[i] = vqdmlalh_s16(sum_32_1[i], a_row0.re, b_vals[2 * i + 1]); - sum_32_3[i] = vqdmlalh_s16(sum_32_3[i], a_row1.re, b_vals[2 * i + 1]); - - // 
Imag * real - sum_32_1[i] = vqdmlalh_s16(sum_32_1[i], a_row0.im, b_vals[2 * i]); - sum_32_3[i] = vqdmlalh_s16(sum_32_3[i], a_row1.im, b_vals[2 * i]); - } + // one more row + if constexpr (rows_a > 1) { + svint16_t res_r1 = svshrnt(svshrnb(c1_re, 16), c1_im, 16); + svst1_s16(pb, (int16_t *)&p_dst[n], res_r1); } - - // Now that we have the sum in 32 bit integer in Q0.31 format, we need to get - // the result back into a Q0.15 16-bit integer. We shift to the right and - // narrow, and we do this on {re, im} pairs of numbers. - - // Zip real and imaginary components - int32x4_t out_32bit[4] = {0}; - out_32bit[0] = vzip1q_s32(sum_32_0, sum_32_1); - out_32bit[1] = vzip2q_s32(sum_32_0, sum_32_1); - out_32bit[2] = vzip1q_s32(sum_32_2, sum_32_3); - out_32bit[3] = vzip2q_s32(sum_32_2, sum_32_3); - - // Narrow to 16-bit - int16x8_t out[2] = {0}; - out[0] = vqshrn_high_n_s32(vqshrn_n_s32(out_32bit[0], 16), out_32bit[1], 16); - out[1] = vqshrn_high_n_s32(vqshrn_n_s32(out_32bit[2], 16), out_32bit[3], 16); - - // Now write to the destination array - vst1q_s16((int16_t *)dst, out[0]); - vst1q_s16((int16_t *)(dst + ldb), out[1]); } +#endif -void armral_cmplx_matmul_i16_32bit_1xkx4( - uint16_t k, const armral_cmplx_int16_t *__restrict a, - const armral_cmplx_int16_t *__restrict b, int ldb, - armral_cmplx_int16_t *dst) { - // Performs the multiplication of a row of matrix A by a set of four columns - // of matrix B. It is assumed that B has four columns and no bounds checking - // is done. Equally, it is assumed that the pointer to A is for a row vector - // of length k. 
+// nb must be even +template +void cmplx_matmul_i16_32bit_mbxkxnb_even( + const uint16_t k, const armral_cmplx_int16_t *const __restrict a, + const uint16_t lda, const armral_cmplx_int16_t *const __restrict b, + armral_cmplx_int16_t *const dst, const uint16_t ldb) { const int16_t *a_int16 = (const int16_t *)a; const int16_t *b_int16 = (const int16_t *)b; - // Accumulators for the real and imaginary components - int32x4_t real_acc[4] = {0}; - int32x4_t imag_acc[4] = {0}; - - // Loop over k in blocks of 8 - for (int blk8 = k >> 3; blk8 > 0; - --blk8, a_int16 += 16, b_int16 += 16 * ldb) { - // Load 8 complex numbers from A into arrays of real and complex components - int16x8x2_t a_row = vld2q_s16(a_int16); - // Load 8 rows from B - int16x8_t b_tmp[8] = {0}; - for (int i = 0; i < 8; ++i) { - b_tmp[i] = vld1q_s16(b_int16 + 2 * i * ldb); - } - - // We now want to transpose the 8x4 matrix of complex numbers, and - // de-interleave into real and complex components - int16x8_t real_bs[4] = {0}; - int16x8_t imag_bs[4] = {0}; - // We first separate out the real and imaginary components - for (int i = 0; i < 4; ++i) { - real_bs[i] = vtrn1q_s16(b_tmp[2 * i], b_tmp[2 * i + 1]); - imag_bs[i] = vtrn2q_s16(b_tmp[2 * i], b_tmp[2 * i + 1]); - } - - // Now we interleave pairs of real numbers to start to get them in order. 
- // For example, for the first two real vectors we have - // ^: [r_0, r_1, r_k, r_k+1, r_2k, r_2k+1, r_3k, r_3k+1] = trn1(v0, v1) - // ^: [r_2, r_3, r_k+2, r_k+3, r_2k+2, r_2k+3, r_3k+2, r_3k+3] = trn1(v2, v3) - // ^: zip1(trn1(v0, v1), trn1(v2, v3)) = [r_0, r_1, r_2, r_3, r_k, r_k+1, r_k+2, r_k+3] - for (int i = 0; i < 2; ++i) { - b_tmp[2 * i] = vreinterpretq_s16_u32( - vzip1q_u32(vreinterpretq_u32_s16(real_bs[2 * i]), - vreinterpretq_u32_s16(real_bs[2 * i + 1]))); - b_tmp[2 * i + 1] = vreinterpretq_s16_u32( - vzip2q_u32(vreinterpretq_u32_s16(real_bs[2 * i]), - vreinterpretq_u32_s16(real_bs[2 * i + 1]))); - - b_tmp[4 + 2 * i] = vreinterpretq_s16_u32( - vzip1q_u32(vreinterpretq_u32_s16(imag_bs[2 * i]), - vreinterpretq_u32_s16(imag_bs[2 * i + 1]))); - b_tmp[4 + 2 * i + 1] = vreinterpretq_s16_u32( - vzip2q_u32(vreinterpretq_u32_s16(imag_bs[2 * i]), - vreinterpretq_u32_s16(imag_bs[2 * i + 1]))); - } - - for (int i = 0; i < 2; ++i) { - real_bs[2 * i] = vreinterpretq_s16_u64( - vzip1q_u64(vreinterpretq_u64_s16(b_tmp[i]), - vreinterpretq_u64_s16(b_tmp[i + 2]))); - real_bs[2 * i + 1] = vreinterpretq_s16_u64( - vzip2q_u64(vreinterpretq_u64_s16(b_tmp[i]), - vreinterpretq_u64_s16(b_tmp[i + 2]))); - - imag_bs[2 * i] = vreinterpretq_s16_u64( - vzip1q_u64(vreinterpretq_u64_s16(b_tmp[4 + i]), - vreinterpretq_u64_s16(b_tmp[i + 6]))); - imag_bs[2 * i + 1] = vreinterpretq_s16_u64( - vzip2q_u64(vreinterpretq_u64_s16(b_tmp[4 + i]), - vreinterpretq_u64_s16(b_tmp[i + 6]))); - } - - // Now perform the dot product of the row of A with columns of B, expanding - // to 32 bits - for (int i = 0; i < 4; ++i) { - // Real * real - real_acc[i] = vqdmlal_s16(real_acc[i], vget_low_s16(a_row.val[0]), - vget_low_s16(real_bs[i])); - real_acc[i] = vqdmlal_high_s16(real_acc[i], a_row.val[0], real_bs[i]); - // Imag * imag - real_acc[i] = vqdmlsl_s16(real_acc[i], vget_low_s16(a_row.val[1]), - vget_low_s16(imag_bs[i])); - real_acc[i] = vqdmlsl_high_s16(real_acc[i], a_row.val[1], imag_bs[i]); - // Real * 
imag - imag_acc[i] = vqdmlal_s16(imag_acc[i], vget_low_s16(a_row.val[0]), - vget_low_s16(imag_bs[i])); - imag_acc[i] = vqdmlal_high_s16(imag_acc[i], a_row.val[0], imag_bs[i]); - // Imag * real - imag_acc[i] = vqdmlal_s16(imag_acc[i], vget_low_s16(a_row.val[1]), - vget_low_s16(real_bs[i])); - imag_acc[i] = vqdmlal_high_s16(imag_acc[i], a_row.val[1], real_bs[i]); + // We will be vectorizing in nblocks of 4, using int??x4_ts. Each vector will + // therefore hold two complex numbers, so we will need nb/2 vectors (per m) + constexpr uint16_t nb_vecs = nb / 2; + + int32x4_t buff[mb][nb_vecs] = {}; + int16x8_t a_rows[mb]; + int16x4x2_t a_val[mb]; + int16x4_t b_row_nrm[nb_vecs]; + int16x4_t b_row_rev[nb_vecs]; + + constexpr uint16_t kb = 4; // load 4 complex A vals at a time + uint16_t kbi = 0; + for (; kbi < k - kb + 1; kbi += kb) { + + for (uint16_t mi = 0; mi < mb; ++mi) { + a_rows[mi] = vld1q_s16(a_int16 + 2 * mi * lda + 2 * kbi); + } + + for (uint16_t ki = 0; ki < kb; ++ki) { + + // a_val holds the single complex value of A we will be multiplying the + // row of B by. + // .val[0] is real part, .val[1] the imag part (with alternating sign) + for (uint16_t mi = 0; mi < mb; ++mi) { + a_val[mi].val[0] = vdup_n_s16(a_rows[mi][2 * ki]); + a_val[mi].val[1] = vdup_n_s16(a_rows[mi][2 * ki + 1]); + a_val[mi].val[1] = + vzip1_s16(vneg_s16(a_val[mi].val[1]), a_val[mi].val[1]); + } + + // b_row_nrm is 2 complex numbers from the row of B + // b_row_rev is the same, but with each num's real and imag part switched. + for (uint16_t nvi = 0; nvi < nb_vecs; ++nvi) { + b_row_nrm[nvi] = vld1_s16(b_int16 + 2 * (kbi + ki) * ldb + 4 * nvi); + b_row_rev[nvi] = vrev32_s16(b_row_nrm[nvi]); + } + + // Do the multiplication. + // eg. 
first two cmplx numbers of C: + // C_r0 | C_i0 | C_r1 | C_i0 = buff + // ======|======|======|===== + // A_r0 | A_r0 | A_r0 | A_r0 = a_val.val[0] + // * | * | * | * + // B_r0 | B_i0 | B_r1 | B_i1 = b_row_nrm + // + | + | + | + + // -A_i0 | A_i0 |-A_i0 | A_i0 = a_val.val[1] + // * | * | * | * + // B_i0 | B_r0 | B_i1 | B_r1 = b_row_rev + for (uint16_t mi = 0; mi < mb; ++mi) { + for (uint16_t nvi = 0; nvi < nb_vecs; ++nvi) { + buff[mi][nvi] = + vqdmlal_s16(buff[mi][nvi], a_val[mi].val[0], b_row_nrm[nvi]); + buff[mi][nvi] = + vqdmlal_s16(buff[mi][nvi], a_val[mi].val[1], b_row_rev[nvi]); + } + } } } + for (; kbi < k; kbi++) { - if ((k & 4) != 0) { - // Load 4 complex numbers from A at a time - int16x4x2_t a_row = vld2_s16(a_int16); - - // Load 4 rows from B - int16x8_t b_vals[4] = {0}; - for (int i = 0; i < 4; ++i) { - b_vals[i] = vld1q_s16(b_int16 + 2 * i * ldb); + for (uint16_t mi = 0; mi < mb; ++mi) { + a_val[mi] = vld2_dup_s16(a_int16 + 2 * kbi + 2 * mi * lda); + a_val[mi].val[1] = + vzip1_s16(vneg_s16(a_val[mi].val[1]), a_val[mi].val[1]); } - // Separate out the real and imaginary components and reorder the columns of - // what was read in as rows. 
Real components go into the low half of int16x8 - // vectors, and imaginary components to the high half - int16x8_t b_tmp[4] = {0}; - for (int i = 0; i < 2; ++i) { - b_tmp[2 * i] = vzip1q_s16(b_vals[2 * i], b_vals[2 * i + 1]); - b_tmp[2 * i + 1] = vzip2q_s16(b_vals[2 * i], b_vals[2 * i + 1]); + for (uint16_t nvi = 0; nvi < nb_vecs; ++nvi) { + b_row_nrm[nvi] = vld1_s16(b_int16 + 2 * kbi * ldb + 4 * nvi); + b_row_rev[nvi] = vrev32_s16(b_row_nrm[nvi]); } - for (int i = 0; i < 2; ++i) { - b_vals[2 * i] = vreinterpretq_s16_u32( - vzip1q_u32(vreinterpretq_u32_s16(b_tmp[i]), - vreinterpretq_u32_s16(b_tmp[i + 2]))); - b_vals[2 * i + 1] = vreinterpretq_s16_u32( - vzip2q_u32(vreinterpretq_u32_s16(b_tmp[i]), - vreinterpretq_u32_s16(b_tmp[i + 2]))); + for (uint16_t mi = 0; mi < mb; ++mi) { + for (uint16_t nvi = 0; nvi < nb_vecs; ++nvi) { + buff[mi][nvi] = + vqdmlal_s16(buff[mi][nvi], a_val[mi].val[0], b_row_nrm[nvi]); + buff[mi][nvi] = + vqdmlal_s16(buff[mi][nvi], a_val[mi].val[1], b_row_rev[nvi]); + } } - - // Now do the multiplication - for (int i = 0; i < 4; ++i) { - // Real * real - real_acc[i] = - vqdmlal_s16(real_acc[i], a_row.val[0], vget_low_s16(b_vals[i])); - // Imag * imag - real_acc[i] = - vqdmlsl_s16(real_acc[i], a_row.val[1], vget_high_s16(b_vals[i])); - // Real * imag - imag_acc[i] = - vqdmlal_s16(imag_acc[i], a_row.val[0], vget_high_s16(b_vals[i])); - // Imag * real - imag_acc[i] = - vqdmlal_s16(imag_acc[i], a_row.val[1], vget_low_s16(b_vals[i])); - } - - a_int16 += 8; - b_int16 += 8 * ldb; - } - - int32x4_t sum_32_0 = vdupq_n_s32(0); - int32x4_t sum_32_1 = vdupq_n_s32(0); - for (int i = 0; i < 4; ++i) { - sum_32_0[i] = vqmovnd_s64(vaddlvq_s32(real_acc[i])); - sum_32_1[i] = vqmovnd_s64(vaddlvq_s32(imag_acc[i])); } - for (int a_col_cnt = k & 3; a_col_cnt > 0; - --a_col_cnt, a_int16 += 2, b_int16 += 2 * ldb) { - armral_cmplx_int16_t a_row = *((const armral_cmplx_int16_t *)a_int16); - - // Load four columns from B - int16x8_t b_vals = vld1q_s16(b_int16); - - for 
(int i = 0; i < 4; ++i) { - // Real * real - sum_32_0[i] = vqdmlalh_s16(sum_32_0[i], a_row.re, b_vals[2 * i]); - // Imag * imag - sum_32_0[i] = vqdmlslh_s16(sum_32_0[i], a_row.im, b_vals[2 * i + 1]); - // Real * imag - sum_32_1[i] = vqdmlalh_s16(sum_32_1[i], a_row.re, b_vals[2 * i + 1]); - // Imag * real - sum_32_1[i] = vqdmlalh_s16(sum_32_1[i], a_row.im, b_vals[2 * i]); + // Write to dst + for (uint16_t mi = 0; mi < mb; ++mi) { + for (uint16_t nvi = 0; nvi < nb_vecs; ++nvi) { + int16x4_t out = vqshrn_n_s32(buff[mi][nvi], 16); + vst1_s16(((int16_t *)dst) + mi * 2 * ldb + 4 * nvi, out); } } - - // Now that we have the sum in 32 bit integer in Q0.31 format, we need to get - // the result back into a Q0.15 16-bit integer. We shift to the right and - // narrow, and we do this on {re, im} pairs of numbers. - - // Zip real and imaginary components - int32x4_t out_32bit[2] = {0}; - out_32bit[0] = vzip1q_s32(sum_32_0, sum_32_1); - out_32bit[1] = vzip2q_s32(sum_32_0, sum_32_1); - - // Narrow to 16-bit - int16x8_t out = {0}; - out = vqshrn_high_n_s32(vqshrn_n_s32(out_32bit[0], 16), out_32bit[1], 16); - - // Now write to the destination array - vst1q_s16((int16_t *)dst, out); } -void armral_cmplx_matmul_i16_32bit_2xkx2( - uint16_t k, const armral_cmplx_int16_t *__restrict a, - const armral_cmplx_int16_t *__restrict b, int ldb, - armral_cmplx_int16_t *dst) { - // Performs the multiplication of a row of matrix A by a pair of columns of - // matrix B. It is assumed that B has four columns and no bounds checking is - // done. Equally, it is assumed that the pointer to A is for a row vector of - // length k. 
- - const int16_t *a_int16 = (const int16_t *)a; - const int16_t *b_int16 = (const int16_t *)b; - // Accumulators for the real and imaginary components of the first row - int32x4_t real_acc[2] = {0}; - int32x4_t imag_acc[2] = {0}; - // Accumulators for the real and imaginary components of the second row - int32x4_t real_acc1[2] = {0}; - int32x4_t imag_acc1[2] = {0}; - - // Loop over k in blocks of 8 - for (int blk8 = k >> 3; blk8 > 0; - --blk8, a_int16 += 16, b_int16 += 16 * ldb) { - // Load 8 complex numbers from A into arrays of real and complex components - int16x8x2_t a_row0 = vld2q_s16(a_int16); - int16x8x2_t a_row1 = vld2q_s16(a_int16 + 2 * k); - - // Load 8 rows from B - int16x8_t b_vals[4] = {0}; - for (int i = 0; i < 4; ++i) { - b_vals[i] = vcombine_s16(vld1_s16(b_int16 + 4 * i * ldb), - vld1_s16(b_int16 + (4 * i + 2) * ldb)); - } - - // We now want to transpose the 8x2 matrix of complex numbers, and - // de-interleave into real and complex components - int16x8_t b_tmp[4] = {0}; - for (int i = 0; i < 2; ++i) { - // Real numbers only - b_tmp[2 * i] = vuzp1q_s16(b_vals[2 * i], b_vals[2 * i + 1]); - // Imaginary numbers only - b_tmp[2 * i + 1] = vuzp2q_s16(b_vals[2 * i], b_vals[2 * i + 1]); - } - - // Even indices of b_vals hold real values, odd indices hold imaginary - // values - for (int i = 0; i < 2; ++i) { - b_vals[i] = vuzp1q_s16(b_tmp[i], b_tmp[i + 2]); - b_vals[i + 2] = vuzp2q_s16(b_tmp[i], b_tmp[i + 2]); - } - - // Now perform the dot product of the row of A with columns of B, expanding - // to 32 bits Row 1 - for (int i = 0; i < 2; ++i) { - // Real * real - real_acc[i] = vqdmlal_s16(real_acc[i], vget_low_s16(a_row0.val[0]), - vget_low_s16(b_vals[2 * i])); - real_acc[i] = vqdmlal_high_s16(real_acc[i], a_row0.val[0], b_vals[2 * i]); - // Imag * imag - real_acc[i] = vqdmlsl_s16(real_acc[i], vget_low_s16(a_row0.val[1]), - vget_low_s16(b_vals[2 * i + 1])); - real_acc[i] = - vqdmlsl_high_s16(real_acc[i], a_row0.val[1], b_vals[2 * i + 1]); - // Real * imag 
- imag_acc[i] = vqdmlal_s16(imag_acc[i], vget_low_s16(a_row0.val[0]), - vget_low_s16(b_vals[2 * i + 1])); - imag_acc[i] = - vqdmlal_high_s16(imag_acc[i], a_row0.val[0], b_vals[2 * i + 1]); - // Imag * real - imag_acc[i] = vqdmlal_s16(imag_acc[i], vget_low_s16(a_row0.val[1]), - vget_low_s16(b_vals[2 * i])); - imag_acc[i] = vqdmlal_high_s16(imag_acc[i], a_row0.val[1], b_vals[2 * i]); - } - - // Row 2 - for (int i = 0; i < 2; ++i) { - // Real * real - real_acc1[i] = vqdmlal_s16(real_acc1[i], vget_low_s16(a_row1.val[0]), - vget_low_s16(b_vals[2 * i])); - real_acc1[i] = - vqdmlal_high_s16(real_acc1[i], a_row1.val[0], b_vals[2 * i]); - // Imag * imag - real_acc1[i] = vqdmlsl_s16(real_acc1[i], vget_low_s16(a_row1.val[1]), - vget_low_s16(b_vals[2 * i + 1])); - real_acc1[i] = - vqdmlsl_high_s16(real_acc1[i], a_row1.val[1], b_vals[2 * i + 1]); - // Real * imag - imag_acc1[i] = vqdmlal_s16(imag_acc1[i], vget_low_s16(a_row1.val[0]), - vget_low_s16(b_vals[2 * i + 1])); - imag_acc1[i] = - vqdmlal_high_s16(imag_acc1[i], a_row1.val[0], b_vals[2 * i + 1]); - // Imag * real - imag_acc1[i] = vqdmlal_s16(imag_acc1[i], vget_low_s16(a_row1.val[1]), - vget_low_s16(b_vals[2 * i])); - imag_acc1[i] = - vqdmlal_high_s16(imag_acc1[i], a_row1.val[1], b_vals[2 * i]); - } - } - - if ((k & 4) != 0) { - // Load 4 complex numbers from A at a time - int16x4x2_t a_tmp = vld2_s16(a_int16); - int16x8_t a_row0 = vcombine_s16(a_tmp.val[0], a_tmp.val[1]); - a_tmp = vld2_s16(a_int16 + 2 * k); - int16x8_t a_row1 = vcombine_s16(a_tmp.val[0], a_tmp.val[1]); - - // Load 4 rows from B - int16x8_t b_vals[2] = {0}; - for (int i = 0; i < 2; ++i) { - b_vals[i] = vcombine_s16(vld1_s16(b_int16 + 4 * i * ldb), - vld1_s16(b_int16 + (4 * i + 2) * ldb)); - } - - // Separate out the real and imaginary components and reorder the columns - // of b_tmp as rows. 
Real components go into the low half of int16x8 - // vectors, and imaginary components to the high half - int16x8_t b_tmp[2] = {0}; - b_tmp[0] = vuzp1q_s16(b_vals[0], b_vals[1]); - b_tmp[1] = vuzp2q_s16(b_vals[0], b_vals[1]); - - b_vals[0] = vuzp1q_s16(b_tmp[0], b_tmp[1]); - b_vals[1] = vuzp2q_s16(b_tmp[0], b_tmp[1]); - - // Do the multiplication (dot-product) - // Row 1 - for (int i = 0; i < 2; ++i) { - // Real * real - real_acc[i] = vqdmlal_s16(real_acc[i], vget_low_s16(a_row0), - vget_low_s16(b_vals[i])); - // Imag * imag - real_acc[i] = vqdmlsl_high_s16(real_acc[i], a_row0, b_vals[i]); - // Real * imag - imag_acc[i] = vqdmlal_s16(imag_acc[i], vget_low_s16(a_row0), - vget_high_s16(b_vals[i])); - // Imag * real - imag_acc[i] = vqdmlal_s16(imag_acc[i], vget_high_s16(a_row0), - vget_low_s16(b_vals[i])); - } - - // Row 2 - for (int i = 0; i < 2; ++i) { - // Real * real - real_acc1[i] = vqdmlal_s16(real_acc1[i], vget_low_s16(a_row1), - vget_low_s16(b_vals[i])); - // Imag * imag - real_acc1[i] = vqdmlsl_high_s16(real_acc1[i], a_row1, b_vals[i]); - // Real * imag - imag_acc1[i] = vqdmlal_s16(imag_acc1[i], vget_low_s16(a_row1), - vget_high_s16(b_vals[i])); - // Imag * real - imag_acc1[i] = vqdmlal_s16(imag_acc1[i], vget_high_s16(a_row1), - vget_low_s16(b_vals[i])); +template +inline __attribute__((always_inline)) void cmplx_matmul_i16_32bit_mbxkx1( + const uint16_t k, const armral_cmplx_int16_t *const __restrict a, + const uint16_t lda, const armral_cmplx_int16_t *const __restrict b, + armral_cmplx_int16_t *const dst, const uint16_t ldb) { + // Just do a naive implementation for the mbxkx1 case. This is a simple loop, + // and we will trust the compiler to not do something horrible with it. 
+ cmplx_int32_t accum[mb] = {}; + for (uint16_t mi = 0; mi < mb; ++mi) { + for (uint16_t i = 0; i < k; ++i) { + armral_cmplx_int16_t a_tmp = a[mi * lda + i]; + armral_cmplx_int16_t b_tmp = b[i * ldb]; + accum[mi].re = vqdmlalh_s16(accum[mi].re, a_tmp.re, b_tmp.re); + accum[mi].re = vqdmlslh_s16(accum[mi].re, a_tmp.im, b_tmp.im); + accum[mi].im = vqdmlalh_s16(accum[mi].im, a_tmp.re, b_tmp.im); + accum[mi].im = vqdmlalh_s16(accum[mi].im, a_tmp.im, b_tmp.re); } - - a_int16 += 8; - b_int16 += 8 * ldb; } - int32x2_t sum_32_0 = vdup_n_s32(0); - int32x2_t sum_32_1 = vdup_n_s32(0); - int32x2_t sum_32_2 = vdup_n_s32(0); - int32x2_t sum_32_3 = vdup_n_s32(0); - for (int i = 0; i < 2; ++i) { - sum_32_0[i] = vqmovnd_s64(vaddlvq_s32(real_acc[i])); - sum_32_1[i] = vqmovnd_s64(vaddlvq_s32(imag_acc[i])); - sum_32_2[i] = vqmovnd_s64(vaddlvq_s32(real_acc1[i])); - sum_32_3[i] = vqmovnd_s64(vaddlvq_s32(imag_acc1[i])); + for (uint16_t mi = 0; mi < mb; ++mi) { + // Narrow to 16 bits. + dst[mi * ldb].re = vqshrns_n_s32(accum[mi].re, 16); + dst[mi * ldb].im = vqshrns_n_s32(accum[mi].im, 16); } - - // For the last three columns of A, iterate element-by-element - for (int a_col_cnt = k & 3; a_col_cnt > 0; - --a_col_cnt, a_int16 += 2, b_int16 += 2 * ldb) { - // Load a single element from A (from two consecutive rows) - armral_cmplx_int16_t a_row0 = *((const armral_cmplx_int16_t *)a_int16); - armral_cmplx_int16_t a_row1 = - *((const armral_cmplx_int16_t *)(a_int16 + 2 * k)); - - // Load two columns from B - int16x4_t b_vals = vld1_s16(b_int16); - - for (int i = 0; i < 2; ++i) { - // Real * real - sum_32_0[i] = vqdmlalh_s16(sum_32_0[i], a_row0.re, b_vals[2 * i]); - sum_32_2[i] = vqdmlalh_s16(sum_32_2[i], a_row1.re, b_vals[2 * i]); - - // Imag * imag - sum_32_0[i] = vqdmlslh_s16(sum_32_0[i], a_row0.im, b_vals[2 * i + 1]); - sum_32_2[i] = vqdmlslh_s16(sum_32_2[i], a_row1.im, b_vals[2 * i + 1]); - - // Real * imag - sum_32_1[i] = vqdmlalh_s16(sum_32_1[i], a_row0.re, b_vals[2 * i + 1]); - 
sum_32_3[i] = vqdmlalh_s16(sum_32_3[i], a_row1.re, b_vals[2 * i + 1]); - - // Imag * real - sum_32_1[i] = vqdmlalh_s16(sum_32_1[i], a_row0.im, b_vals[2 * i]); - sum_32_3[i] = vqdmlalh_s16(sum_32_3[i], a_row1.im, b_vals[2 * i]); - } - } - - // Now that we have the sum in 32 bit integer in Q0.31 format, we need to get - // the result back into a Q0.15 16-bit integer. We shift to the right and - // narrow, and we do this on {re, im} pairs of numbers. - - // Zip real and imaginary components - int32x4_t out_32bit[2] = {0}; - out_32bit[0] = vcombine_s32(vzip1_s32(sum_32_0, sum_32_1), - vzip2_s32(sum_32_0, sum_32_1)); - out_32bit[1] = vcombine_s32(vzip1_s32(sum_32_2, sum_32_3), - vzip2_s32(sum_32_2, sum_32_3)); - - // Narrow to 16-bit - int16x4_t out[2] = {0}; - out[0] = vqshrn_n_s32(out_32bit[0], 16); - out[1] = vqshrn_n_s32(out_32bit[1], 16); - - // Write to the destination array - vst1_s16((int16_t *)dst, out[0]); - vst1_s16((int16_t *)(dst + ldb), out[1]); } -void armral_cmplx_matmul_i16_32bit_1xkx2( - uint16_t k, const armral_cmplx_int16_t *__restrict a, - const armral_cmplx_int16_t *__restrict b, int ldb, - armral_cmplx_int16_t *dst) { - // Performs the multiplication of a row of matrix A by a pair of columns of - // matrix B. It is assumed that B has four columns and no bounds checking is - // done. Equally, it is assumed that the pointer to A is for a row vector of - // length k. 
- - const int16_t *a_int16 = (const int16_t *)a; - const int16_t *b_int16 = (const int16_t *)b; - // Accumulators for the real and imaginary components of the first row - int32x4_t real_acc[2] = {0}; - int32x4_t imag_acc[2] = {0}; - - // Loop over k in blocks of 8 - for (int blk8 = k >> 3; blk8 > 0; - --blk8, a_int16 += 16, b_int16 += 16 * ldb) { - // Load 8 complex numbers from A into arrays of real and complex components - int16x8x2_t a_row = vld2q_s16(a_int16); - - // Load 8 rows from B - int16x8_t b_vals[4] = {0}; - for (int i = 0; i < 4; ++i) { - b_vals[i] = vcombine_s16(vld1_s16(b_int16 + 4 * i * ldb), - vld1_s16(b_int16 + (4 * i + 2) * ldb)); - } - - // We now want to transpose the 8x2 matrix of complex numbers, and - // de-interleave into real and complex components - int16x8_t b_tmp[4] = {0}; - for (int i = 0; i < 2; ++i) { - // Real numbers only - b_tmp[2 * i] = vuzp1q_s16(b_vals[2 * i], b_vals[2 * i + 1]); - // Imaginary numbers only - b_tmp[2 * i + 1] = vuzp2q_s16(b_vals[2 * i], b_vals[2 * i + 1]); - } - - // Even indices of b_vals hold real values, odd indices hold imaginary - // values - for (int i = 0; i < 2; ++i) { - b_vals[i] = vuzp1q_s16(b_tmp[i], b_tmp[i + 2]); - b_vals[i + 2] = vuzp2q_s16(b_tmp[i], b_tmp[i + 2]); - } +template +void cmplx_matmul_i16_32bit_mbxkxn( + const uint16_t n, const uint16_t k, + const armral_cmplx_int16_t *const __restrict p_src_a, const uint16_t lda, + const armral_cmplx_int16_t *const __restrict p_src_b, + armral_cmplx_int16_t *const dst, const uint16_t ldb) { + const armral_cmplx_int16_t *a_ptr = p_src_a; + armral_cmplx_int16_t *dst_ptr = dst; + const armral_cmplx_int16_t *b_ptr = p_src_b; - // Perform the dot product of the row of A with columns of B, expanding to - // 32 bits - for (int i = 0; i < 2; ++i) { - // Real * real - real_acc[i] = vqdmlal_s16(real_acc[i], vget_low_s16(a_row.val[0]), - vget_low_s16(b_vals[2 * i])); - real_acc[i] = vqdmlal_high_s16(real_acc[i], a_row.val[0], b_vals[2 * i]); - // Imag * imag - 
real_acc[i] = vqdmlsl_s16(real_acc[i], vget_low_s16(a_row.val[1]), - vget_low_s16(b_vals[2 * i + 1])); - real_acc[i] = - vqdmlsl_high_s16(real_acc[i], a_row.val[1], b_vals[2 * i + 1]); - // Real * imag - imag_acc[i] = vqdmlal_s16(imag_acc[i], vget_low_s16(a_row.val[0]), - vget_low_s16(b_vals[2 * i + 1])); - imag_acc[i] = - vqdmlal_high_s16(imag_acc[i], a_row.val[0], b_vals[2 * i + 1]); - // Imag * real - imag_acc[i] = vqdmlal_s16(imag_acc[i], vget_low_s16(a_row.val[1]), - vget_low_s16(b_vals[2 * i])); - imag_acc[i] = vqdmlal_high_s16(imag_acc[i], a_row.val[1], b_vals[2 * i]); - } + for (uint16_t b_col_cnt = n >> 3; b_col_cnt > 0; + --b_col_cnt, b_ptr += 8, dst_ptr += 8) { + cmplx_matmul_i16_32bit_mbxkxnb_even(k, a_ptr, lda, b_ptr, dst_ptr, + ldb); } - - if ((k & 4) != 0) { - // Load 4 complex numbers from A at a time - int16x4x2_t a_tmp = vld2_s16(a_int16); - int16x8_t a_row = vcombine_s16(a_tmp.val[0], a_tmp.val[1]); - - // Load 4 rows from B - int16x8_t b_vals[2] = {0}; - for (int i = 0; i < 2; ++i) { - b_vals[i] = vcombine_s16(vld1_s16(b_int16 + 4 * i * ldb), - vld1_s16(b_int16 + (4 * i + 2) * ldb)); - } - - // Separate out the real and imaginary components and reorder the columns - // of b_tmp as rows. 
Real components go into the low half of int16x8 - // vectors, and imaginary components to the high half - int16x8_t b_tmp[2] = {0}; - b_tmp[0] = vuzp1q_s16(b_vals[0], b_vals[1]); - b_tmp[1] = vuzp2q_s16(b_vals[0], b_vals[1]); - - b_vals[0] = vuzp1q_s16(b_tmp[0], b_tmp[1]); - b_vals[1] = vuzp2q_s16(b_tmp[0], b_tmp[1]); - - // Do the multiplication (dot-product) - for (int i = 0; i < 2; ++i) { - // Real * real - real_acc[i] = vqdmlal_s16(real_acc[i], vget_low_s16(a_row), - vget_low_s16(b_vals[i])); - // Imag * imag - real_acc[i] = vqdmlsl_high_s16(real_acc[i], a_row, b_vals[i]); - // Real * imag - imag_acc[i] = vqdmlal_s16(imag_acc[i], vget_low_s16(a_row), - vget_high_s16(b_vals[i])); - // Imag * real - imag_acc[i] = vqdmlal_s16(imag_acc[i], vget_high_s16(a_row), - vget_low_s16(b_vals[i])); - } - - a_int16 += 8; - b_int16 += 8 * ldb; + if ((n & 4) != 0) { + cmplx_matmul_i16_32bit_mbxkxnb_even(k, a_ptr, lda, b_ptr, dst_ptr, + ldb); + b_ptr += 4; + dst_ptr += 4; } - - int32x2_t sum_32_0 = vdup_n_s32(0); - int32x2_t sum_32_1 = vdup_n_s32(0); - for (int i = 0; i < 2; ++i) { - sum_32_0[i] = vqmovnd_s64(vaddlvq_s32(real_acc[i])); - sum_32_1[i] = vqmovnd_s64(vaddlvq_s32(imag_acc[i])); + if ((n & 2) != 0) { + cmplx_matmul_i16_32bit_mbxkxnb_even(k, a_ptr, lda, b_ptr, dst_ptr, + ldb); + b_ptr += 2; + dst_ptr += 2; } - - // For the last three columns of A, iterate element-by-element - for (int a_col_cnt = k & 3; a_col_cnt > 0; - --a_col_cnt, a_int16 += 2, b_int16 += 2 * ldb) { - // Load a single element from A (from two consecutive rows) - armral_cmplx_int16_t a_row = *((const armral_cmplx_int16_t *)a_int16); - - // Load two columns from B - int16x4_t b_vals = vld1_s16(b_int16); - - for (int i = 0; i < 2; ++i) { - // Real * real - sum_32_0[i] = vqdmlalh_s16(sum_32_0[i], a_row.re, b_vals[2 * i]); - // Imag * imag - sum_32_0[i] = vqdmlslh_s16(sum_32_0[i], a_row.im, b_vals[2 * i + 1]); - // Real * imag - sum_32_1[i] = vqdmlalh_s16(sum_32_1[i], a_row.re, b_vals[2 * i + 1]); - // 
Imag * real - sum_32_1[i] = vqdmlalh_s16(sum_32_1[i], a_row.im, b_vals[2 * i]); - } + if ((n & 1) != 0) { + cmplx_matmul_i16_32bit_mbxkx1(k, a_ptr, lda, b_ptr, dst_ptr, ldb); } - - // Now that we have the sum in 32 bit integer in Q0.31 format, we need to get - // the result back into a Q0.15 16-bit integer. We shift to the right and - // narrow, and we do this on {re, im} pairs of numbers. - - // Zip real and imaginary components - int32x4_t out_32bit = vcombine_s32(vzip1_s32(sum_32_0, sum_32_1), - vzip2_s32(sum_32_0, sum_32_1)); - - // Narrow to 16-bit - int16x4_t out = vqshrn_n_s32(out_32bit, 16); - - // Write to the destination array - vst1_s16((int16_t *)dst, out); } -void armral_cmplx_matmul_i16_32bit_2xkx1( - uint16_t k, const armral_cmplx_int16_t *__restrict a, - const armral_cmplx_int16_t *__restrict b, int ldb, - armral_cmplx_int16_t *dst) { - // Performs the multiplication of a row of matrix A by a single column of - // matrix B. It is assumed that B has four columns and no bounds checking is - // done. Equally, it is assumed that the pointer to A is for a row vector of - // length k. 
- - const int16_t *a_int16 = (const int16_t *)a; - const int16_t *b_int16 = (const int16_t *)b; - // Accumulators for the real and imaginary components of the first row - int32x4_t real_acc = {0}; - int32x4_t imag_acc = {0}; - // Accumulators for the real and imaginary components of the second row - int32x4_t real_acc1 = {0}; - int32x4_t imag_acc1 = {0}; - - // Loop over k in blocks of 8 - for (int blk8 = k >> 3; blk8 > 0; - --blk8, a_int16 += 16, b_int16 += 16 * ldb) { - // Load 8 complex numbers from A into arrays of real and complex components - int16x8x2_t a_row0 = vld2q_s16(a_int16); - int16x8x2_t a_row1 = vld2q_s16(a_int16 + 2 * k); - - // Load 8 rows from B, and store real and complex parts separately - int16x8x2_t b_vals = {0}; - b_vals = vld2q_lane_s16(b_int16, b_vals, 0); - b_vals = vld2q_lane_s16(b_int16 + 2 * ldb, b_vals, 1); - b_vals = vld2q_lane_s16(b_int16 + 4 * ldb, b_vals, 2); - b_vals = vld2q_lane_s16(b_int16 + 6 * ldb, b_vals, 3); - b_vals = vld2q_lane_s16(b_int16 + 8 * ldb, b_vals, 4); - b_vals = vld2q_lane_s16(b_int16 + 10 * ldb, b_vals, 5); - b_vals = vld2q_lane_s16(b_int16 + 12 * ldb, b_vals, 6); - b_vals = vld2q_lane_s16(b_int16 + 14 * ldb, b_vals, 7); - - // Perform the dot product of the row of A with columns of B, expanding to - // 32 bits Row 1 real * real - real_acc = vqdmlal_s16(real_acc, vget_low_s16(a_row0.val[0]), - vget_low_s16(b_vals.val[0])); - real_acc = vqdmlal_high_s16(real_acc, a_row0.val[0], b_vals.val[0]); - // Imag * imag - real_acc = vqdmlsl_s16(real_acc, vget_low_s16(a_row0.val[1]), - vget_low_s16(b_vals.val[1])); - real_acc = vqdmlsl_high_s16(real_acc, a_row0.val[1], b_vals.val[1]); - // Real * imag - imag_acc = vqdmlal_s16(imag_acc, vget_low_s16(a_row0.val[0]), - vget_low_s16(b_vals.val[1])); - imag_acc = vqdmlal_high_s16(imag_acc, a_row0.val[0], b_vals.val[1]); - // Imag * real - imag_acc = vqdmlal_s16(imag_acc, vget_low_s16(a_row0.val[1]), - vget_low_s16(b_vals.val[0])); - imag_acc = vqdmlal_high_s16(imag_acc, 
a_row0.val[1], b_vals.val[0]); - - // Row 2 - // Real * real - real_acc1 = vqdmlal_s16(real_acc1, vget_low_s16(a_row1.val[0]), - vget_low_s16(b_vals.val[0])); - real_acc1 = vqdmlal_high_s16(real_acc1, a_row1.val[0], b_vals.val[0]); - // Imag * imag - real_acc1 = vqdmlsl_s16(real_acc1, vget_low_s16(a_row1.val[1]), - vget_low_s16(b_vals.val[1])); - real_acc1 = vqdmlsl_high_s16(real_acc1, a_row1.val[1], b_vals.val[1]); - // Real * imag - imag_acc1 = vqdmlal_s16(imag_acc1, vget_low_s16(a_row1.val[0]), - vget_low_s16(b_vals.val[1])); - imag_acc1 = vqdmlal_high_s16(imag_acc1, a_row1.val[0], b_vals.val[1]); - // Imag * real - imag_acc1 = vqdmlal_s16(imag_acc1, vget_low_s16(a_row1.val[1]), - vget_low_s16(b_vals.val[0])); - imag_acc1 = vqdmlal_high_s16(imag_acc1, a_row1.val[1], b_vals.val[0]); - } - - if ((k & 4) != 0) { - // Load 4 complex numbers from A at a time - int16x4x2_t a_tmp = vld2_s16(a_int16); - int16x8_t a_row0 = vcombine_s16(a_tmp.val[0], a_tmp.val[1]); - a_tmp = vld2_s16(a_int16 + 2 * k); - int16x8_t a_row1 = vcombine_s16(a_tmp.val[0], a_tmp.val[1]); - - // Load 4 rows from B - int16x4x2_t b_tmp = {0}; - b_tmp = vld2_lane_s16(b_int16, b_tmp, 0); - b_tmp = vld2_lane_s16(b_int16 + 2 * ldb, b_tmp, 1); - b_tmp = vld2_lane_s16(b_int16 + 4 * ldb, b_tmp, 2); - b_tmp = vld2_lane_s16(b_int16 + 6 * ldb, b_tmp, 3); - int16x8_t b_vals = vcombine_s16(b_tmp.val[0], b_tmp.val[1]); - - // Do the multiplication (dot-product) - // Row 1 - // Real * real - real_acc = - vqdmlal_s16(real_acc, vget_low_s16(a_row0), vget_low_s16(b_vals)); - // Imag * imag - real_acc = vqdmlsl_high_s16(real_acc, a_row0, b_vals); - // Real * imag - imag_acc = - vqdmlal_s16(imag_acc, vget_low_s16(a_row0), vget_high_s16(b_vals)); - // Imag * real - imag_acc = - vqdmlal_s16(imag_acc, vget_high_s16(a_row0), vget_low_s16(b_vals)); - - // Row 2 - // Real * real - real_acc1 = - vqdmlal_s16(real_acc1, vget_low_s16(a_row1), vget_low_s16(b_vals)); - // Imag * imag - real_acc1 = vqdmlsl_high_s16(real_acc1, 
a_row1, b_vals); - // Real * imag - imag_acc1 = - vqdmlal_s16(imag_acc1, vget_low_s16(a_row1), vget_high_s16(b_vals)); - // Imag * real - imag_acc1 = - vqdmlal_s16(imag_acc1, vget_high_s16(a_row1), vget_low_s16(b_vals)); - - a_int16 += 8; - b_int16 += 8 * ldb; - } - - cmplx_int32_t sum_32[2]; - sum_32[0].re = vqmovnd_s64(vaddlvq_s32(real_acc)); - sum_32[0].im = vqmovnd_s64(vaddlvq_s32(imag_acc)); - sum_32[1].re = vqmovnd_s64(vaddlvq_s32(real_acc1)); - sum_32[1].im = vqmovnd_s64(vaddlvq_s32(imag_acc1)); - - // For the last three columns of A, iterate element-by-element - for (int a_col_cnt = k & 3; a_col_cnt > 0; - --a_col_cnt, a_int16 += 2, b_int16 += 2 * ldb) { - // Load a single element from A (from two consecutive rows) - armral_cmplx_int16_t a_row0 = *((const armral_cmplx_int16_t *)a_int16); - armral_cmplx_int16_t a_row1 = - *((const armral_cmplx_int16_t *)(a_int16 + 2 * k)); - - // Load one value from B - armral_cmplx_int16_t b_val = *((const armral_cmplx_int16_t *)b_int16); +template +armral_status cmplx_matmul_i16_32bit_block_m( + const uint16_t m, const uint16_t n, const uint16_t k, + const armral_cmplx_int16_t *const __restrict p_src_a, const uint16_t lda, + const armral_cmplx_int16_t *const __restrict p_src_b, + armral_cmplx_int16_t *const dst, const uint16_t ldb) { - // Real * real - sum_32[0].re = vqdmlalh_s16(sum_32[0].re, a_row0.re, b_val.re); - sum_32[1].re = vqdmlalh_s16(sum_32[1].re, a_row1.re, b_val.re); + uint16_t mbi = 0; - // Imag * imag - sum_32[0].re = vqdmlslh_s16(sum_32[0].re, a_row0.im, b_val.im); - sum_32[1].re = vqdmlslh_s16(sum_32[1].re, a_row1.im, b_val.im); - - // Real * imag - sum_32[0].im = vqdmlalh_s16(sum_32[0].im, a_row0.re, b_val.im); - sum_32[1].im = vqdmlalh_s16(sum_32[1].im, a_row1.re, b_val.im); - - // Imag * real - sum_32[0].im = vqdmlalh_s16(sum_32[0].im, a_row0.im, b_val.re); - sum_32[1].im = vqdmlalh_s16(sum_32[1].im, a_row1.im, b_val.re); + for (; mbi < m - mb + 1; mbi += mb) { + cmplx_matmul_i16_32bit_mbxkxn(n, k, 
p_src_a + mbi * lda, lda, p_src_b, + dst + mbi * ldb, ldb); } - - // Now that we have the sum in 32 bit integer in Q0.31 format, we need to get - // the result back into a Q0.15 16-bit integer. We shift to the right. - dst[0].re = vqshrns_n_s32(sum_32[0].re, 16); - dst[0].im = vqshrns_n_s32(sum_32[0].im, 16); - dst[ldb].re = vqshrns_n_s32(sum_32[1].re, 16); - dst[ldb].im = vqshrns_n_s32(sum_32[1].im, 16); -} - -inline __attribute__((always_inline)) void -armral_cmplx_matmul_i16_32bit_1xkx1(uint16_t k, - const armral_cmplx_int16_t *__restrict a, - const armral_cmplx_int16_t *__restrict b, - int ldb, armral_cmplx_int16_t *dst) { - // Just do a naive implementation for the 1xkx1 case. This is a simple loop, - // and we will trust the compiler to not do something horrible with it. - cmplx_int32_t accum = {}; - for (int i = 0; i < k; ++i) { - armral_cmplx_int16_t a_tmp = a[i]; - armral_cmplx_int16_t b_tmp = b[i * ldb]; - accum.re = vqdmlalh_s16(accum.re, a_tmp.re, b_tmp.re); - accum.re = vqdmlslh_s16(accum.re, a_tmp.im, b_tmp.im); - accum.im = vqdmlalh_s16(accum.im, a_tmp.re, b_tmp.im); - accum.im = vqdmlalh_s16(accum.im, a_tmp.im, b_tmp.re); + for (; mbi < m; mbi++) { // remainder + cmplx_matmul_i16_32bit_mbxkxn<1>(n, k, p_src_a + mbi * lda, lda, p_src_b, + dst + mbi * ldb, ldb); } - - // Narrow to 16 bits. 
- dst->re = vqshrns_n_s32(accum.re, 16); - dst->im = vqshrns_n_s32(accum.im, 16); + return ARMRAL_SUCCESS; } armral_status @@ -1005,51 +304,61 @@ cmplx_matmul_i16_32bit(const uint16_t m, const uint16_t n, const uint16_t k, const armral_cmplx_int16_t *__restrict p_src_a, const armral_cmplx_int16_t *__restrict p_src_b, armral_cmplx_int16_t *p_dst) { - // Loop over two rows of A at a time - const armral_cmplx_int16_t *a_ptr = p_src_a; - armral_cmplx_int16_t *out_ptr = p_dst; - for (uint16_t a_row_cnt = m >> 1; a_row_cnt > 0; - --a_row_cnt, a_ptr += 2 * k, out_ptr += 2 * n) { - armral_cmplx_int16_t *out_row_ptr = out_ptr; - // Loop over four columns of B - const armral_cmplx_int16_t *b_ptr = p_src_b; - for (uint16_t b_col_cnt = n >> 2; b_col_cnt > 0; - --b_col_cnt, b_ptr += 4, out_row_ptr += 4) { - armral_cmplx_matmul_i16_32bit_2xkx4(k, a_ptr, b_ptr, n, out_row_ptr); - } - // If there are two or more columns left in B, unroll by two columns - if ((n & 2) != 0) { - armral_cmplx_matmul_i16_32bit_2xkx2(k, a_ptr, b_ptr, n, out_row_ptr); - b_ptr += 2; - out_row_ptr += 2; - } - // Deal with a tail, if there is one in the columns of B - if ((n & 1) != 0) { - armral_cmplx_matmul_i16_32bit_2xkx1(k, a_ptr, b_ptr, n, out_row_ptr); - } - } - - if ((m & 1) != 0) { - armral_cmplx_int16_t *out_row_ptr = out_ptr; - // Loop over four columns of B - const armral_cmplx_int16_t *b_ptr = p_src_b; - for (uint16_t b_col_cnt = n >> 2; b_col_cnt > 0; - --b_col_cnt, b_ptr += 4, out_row_ptr += 4) { - armral_cmplx_matmul_i16_32bit_1xkx4(k, a_ptr, b_ptr, n, out_row_ptr); - } - - if ((n & 2) != 0) { - armral_cmplx_matmul_i16_32bit_1xkx2(k, a_ptr, b_ptr, n, out_row_ptr); - b_ptr += 2; - out_row_ptr += 2; - } - - if ((n & 1) != 0) { - armral_cmplx_matmul_i16_32bit_1xkx1(k, a_ptr, b_ptr, n, out_row_ptr); - } +#if ARMRAL_ARCH_SVE >= 2 + if (m > 2 && n > 2) { + svbool_t pb; + uint16_t i = 0; + uint16_t j = 0; + for (; i < (m - 1); i += 2) { + j = 0; + pb = svptrue_pat_b16(SV_VL8); + for (; j < (n - 3); 
j += 4) { + cmplx_matmul_i16_32bit_ixkxj_sve<2>(pb, m, n, k, &p_src_a[i * k], + &p_src_b[j], &p_dst[i * n + j]); + } + + if (j < (n - 1)) { + cmplx_matmul_i16_32bit_ixkxj_sve<2>(svptrue_pat_b16(SV_VL4), m, n, k, + &p_src_a[i * k], &p_src_b[j], + &p_dst[i * n + j]); + j += 2; + } + // If n is odd, we have one more row/col to go + if (n % 2 != 0) { + j = n - 1; + cmplx_matmul_i16_32bit_ixkxj_sve<2>(svptrue_pat_b16(SV_VL2), m, n, k, + &p_src_a[i * k], &p_src_b[j], + &p_dst[i * n + j]); + } + } + + if (m % 2 != 0) { + i = m - 1; + j = 0; + pb = svptrue_pat_b16(SV_VL8); + for (; j < (n - 3); j += 4) { + cmplx_matmul_i16_32bit_ixkxj_sve<1>(pb, m, n, k, &p_src_a[i * k], + &p_src_b[j], &p_dst[i * n + j]); + } + if (j < (n - 1)) { + cmplx_matmul_i16_32bit_ixkxj_sve<1>(svptrue_pat_b16(SV_VL4), m, n, k, + &p_src_a[i * k], &p_src_b[j], + &p_dst[i * n + j]); + j += 2; + } + // If n is odd, we have one more row/col to go + if (n % 2 != 0) { + j = n - 1; + cmplx_matmul_i16_32bit_ixkxj_sve<1>(svptrue_pat_b16(SV_VL2), m, n, k, + &p_src_a[i * k], &p_src_b[j], + &p_dst[i * n + j]); + } + } + return ARMRAL_SUCCESS; } - - return ARMRAL_SUCCESS; +#endif + return cmplx_matmul_i16_32bit_block_m<2>(m, n, k, p_src_a, k, p_src_b, p_dst, + n); } } // anonymous namespace diff --git a/src/BasicMathFun/MatrixPseudoInv/arm_cmplx_pseudo_inverse_direct_f32.cpp b/src/BasicMathFun/MatrixPseudoInv/arm_cmplx_pseudo_inverse_direct_f32.cpp index a8ec7c3c32d1e7f725600cfb09e30b6e7191d83d..41547d190d185971d50b3e05515201fb1750d353 100644 --- a/src/BasicMathFun/MatrixPseudoInv/arm_cmplx_pseudo_inverse_direct_f32.cpp +++ b/src/BasicMathFun/MatrixPseudoInv/arm_cmplx_pseudo_inverse_direct_f32.cpp @@ -114,8 +114,8 @@ cmplx_pseudo_inverse_direct(uint16_t m, uint16_t n, const float32_t lambda, return ARMRAL_SUCCESS; } - // If the number of rows in the input matrix is less than or equal to the number - // of columns then use the right pseudo-inverse + // If the number of rows in the input matrix is less than or 
equal to the + // number of columns then use the right pseudo-inverse switch (m) { case 1: { right_pseudo_inverse<1>(n, lambda, p_src, p_dst, allocator); diff --git a/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32.c b/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32.c index 45498f7747681fe0591a8b20fa3258e131ad287f..ee71134a30f3f346e0bc35f29f2aadf0e7089077 100644 --- a/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32.c +++ b/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32.c @@ -13,13 +13,13 @@ armral_cmplx_vecdot_f32(uint32_t n, const armral_cmplx_f32_t *restrict p_src_a, const armral_cmplx_f32_t *restrict p_src_b, armral_cmplx_f32_t *p_src_c) { #ifdef ARMRAL_ARCH_SVE - uint32_t num_lanes = svcntd(); + int64_t num_lanes = svcntd(); svbool_t ptrue = svptrue_b32(); svfloat32_t acc0 = svdup_n_f32(0); svfloat32_t acc1 = svdup_n_f32(0); - uint32_t i = 0; - for (; (i + 2) * num_lanes <= n; i += 2) { + int64_t i = 0; + for (; (i + 2) * num_lanes <= (int64_t)n; i += 2) { svbool_t pg = svptrue_b32(); svfloat32_t vec_a0 = svld1_vnum_f32(pg, (const float32_t *)p_src_a, i); svfloat32_t vec_b0 = svld1_vnum_f32(pg, (const float32_t *)p_src_b, i); @@ -32,8 +32,8 @@ armral_cmplx_vecdot_f32(uint32_t n, const armral_cmplx_f32_t *restrict p_src_a, acc1 = svcmla_f32_m(pg, acc1, vec_a1, vec_b1, 90); } - for (; i * num_lanes < n; ++i) { - svbool_t pg = svwhilelt_b32(2 * i * num_lanes, 2 * n); + for (; i * num_lanes < (int64_t)n; ++i) { + svbool_t pg = svwhilelt_b32(2 * i * num_lanes, 2 * (int64_t)n); svfloat32_t vec_a = svld1_vnum_f32(pg, (const float32_t *)p_src_a, i); svfloat32_t vec_b = svld1_vnum_f32(pg, (const float32_t *)p_src_b, i); @@ -58,12 +58,9 @@ armral_cmplx_vecdot_f32(uint32_t n, const armral_cmplx_f32_t *restrict p_src_a, float32x4x2_t vec2; float32x4x2_t vec3; float32x4x2_t vec4; - float32x4_t acc_r; - float32x4_t acc_i; - float32x2_t accum = vdup_n_f32(0); - acc_r = vdupq_n_f32(0.0); - acc_i = vdupq_n_f32(0.0); + float32x4_t acc_r = vdupq_n_f32(0.0); + 
float32x4_t acc_i = vdupq_n_f32(0.0); /* Loop unrolling: Compute 8 outputs at a time */ blk_cnt = n >> 3U; @@ -106,7 +103,7 @@ armral_cmplx_vecdot_f32(uint32_t n, const armral_cmplx_f32_t *restrict p_src_a, blk_cnt--; } - accum = vpadd_f32(vget_low_f32(acc_r), vget_high_f32(acc_r)); + float32x2_t accum = vpadd_f32(vget_low_f32(acc_r), vget_high_f32(acc_r)); real_sum += accum[0] + accum[1]; accum = vpadd_f32(vget_low_f32(acc_i), vget_high_f32(acc_i)); diff --git a/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32_2.c b/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32_2.c index 3fb08351cffb57f435660bba01f262a7d9b90a72..fa9b224bbd7501d865cf4f89f0580ff2a22d42f7 100644 --- a/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32_2.c +++ b/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32_2.c @@ -76,12 +76,9 @@ armral_status armral_cmplx_vecdot_f32_2(uint32_t n, float32x4_t vec_a_im[2]; float32x4_t vec_b_re[2]; float32x4_t vec_b_im[2]; - float32x4_t acc_r; - float32x4_t acc_i; - float32x2_t accum = vdup_n_f32(0); - acc_r = vdupq_n_f32(0.0); - acc_i = vdupq_n_f32(0.0); + float32x4_t acc_r = vdupq_n_f32(0.0); + float32x4_t acc_i = vdupq_n_f32(0.0); /* Loop unrolling: Compute 8 outputs at a time */ blk_cnt = n >> 3U; @@ -121,7 +118,7 @@ armral_status armral_cmplx_vecdot_f32_2(uint32_t n, blk_cnt--; } - accum = vpadd_f32(vget_low_f32(acc_r), vget_high_f32(acc_r)); + float32x2_t accum = vpadd_f32(vget_low_f32(acc_r), vget_high_f32(acc_r)); real_sum += accum[0] + accum[1]; accum = vpadd_f32(vget_low_f32(acc_i), vget_high_f32(acc_i)); diff --git a/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16.c b/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16.c index 32d88d6960cfa0c5e541675538b80409f178468a..7fad68349e8256964189fa664969327fdb4768f3 100644 --- a/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16.c +++ b/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16.c @@ -56,11 +56,9 @@ armral_cmplx_vecdot_i16(uint32_t n, int16x4x2_t vec2; int32x4x2_t vec1_ext; int32x4x2_t 
vec2_ext; - int64x2_t acc_r; - int64x2_t acc_i; - acc_r = vdupq_n_s64(0); - acc_i = vdupq_n_s64(0); + int64x2_t acc_r = vdupq_n_s64(0); + int64x2_t acc_i = vdupq_n_s64(0); /* Compute 8 outputs at a time */ blk_cnt = n >> 2U; diff --git a/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2.c b/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2.c index af4450445dd61d0f537684d130c94d50ac55a465..ed3a538106f6520ad4aa4d5d4a4abb34634622be 100644 --- a/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2.c +++ b/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2.c @@ -88,11 +88,9 @@ armral_status armral_cmplx_vecdot_i16_2(uint32_t n, int32x4_t vec_a_im_ext; int32x4_t vec_b_re_ext; int32x4_t vec_b_im_ext; - int64x2_t acc_r; - int64x2_t acc_i; - acc_r = vdupq_n_s64(0); - acc_i = vdupq_n_s64(0); + int64x2_t acc_r = vdupq_n_s64(0); + int64x2_t acc_i = vdupq_n_s64(0); /* Compute 4 outputs at a time */ blk_cnt = n >> 2U; diff --git a/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32_2.c b/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32_2.c index 0bff5db6a5e177117ba621d56e008414089e1e71..fb1a218eb0c87f60d55bf9bd6d4b98fe7b5bf8b2 100644 --- a/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32_2.c +++ b/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32_2.c @@ -14,11 +14,11 @@ armral_status armral_cmplx_vecmul_f32_2(uint32_t n, const float32_t *restrict b_im, float32_t *c_re, float32_t *c_im) { #ifdef ARMRAL_ARCH_SVE - uint32_t num_lanes = svcntw(); + int64_t num_lanes = svcntw(); svbool_t pg = svptrue_b32(); - uint32_t i = 0; - for (; (i + 4) * num_lanes <= n; i += 4) { + int64_t i = 0; + for (; (i + 4) * num_lanes <= (int64_t)n; i += 4) { svfloat32_t vec_a_0_re = svld1_vnum_f32(pg, a_re, i); svfloat32_t vec_a_0_im = svld1_vnum_f32(pg, a_im, i); svfloat32_t vec_b_0_re = svld1_vnum_f32(pg, b_re, i); @@ -55,7 +55,7 @@ armral_status armral_cmplx_vecmul_f32_2(uint32_t n, vec_c_2_im = svmla_f32_x(pg, vec_c_2_im, vec_a_2_im, vec_b_2_re); vec_c_3_re = svmls_f32_x(pg, vec_c_3_re, 
vec_a_3_im, vec_b_3_im); vec_c_3_im = svmla_f32_x(pg, vec_c_3_im, vec_a_3_im, vec_b_3_re); - asm volatile(""); + svst1_vnum_f32(pg, c_re, i + 0, vec_c_0_re); svst1_vnum_f32(pg, c_im, i + 0, vec_c_0_im); svst1_vnum_f32(pg, c_re, i + 1, vec_c_1_re); @@ -66,8 +66,8 @@ armral_status armral_cmplx_vecmul_f32_2(uint32_t n, svst1_vnum_f32(pg, c_im, i + 3, vec_c_3_im); } - for (; i * num_lanes < n; i++) { - pg = svwhilelt_b32(i * num_lanes, n); + for (; i * num_lanes < (int64_t)n; i++) { + pg = svwhilelt_b32(i * num_lanes, (int64_t)n); svfloat32_t vec_a_re = svld1_vnum_f32(pg, a_re, i); svfloat32_t vec_a_im = svld1_vnum_f32(pg, a_im, i); svfloat32_t vec_b_re = svld1_vnum_f32(pg, b_re, i); diff --git a/src/DuRuInterface/MuLawCompression/arm_mu_law_compression.cpp b/src/DuRuInterface/MuLawCompression/arm_mu_law_compression.cpp index 76d179ffb250a673354342471107428f52edd5f5..c30b4e1fc16d7af06e7767cf7908785bde24b3ed 100644 --- a/src/DuRuInterface/MuLawCompression/arm_mu_law_compression.cpp +++ b/src/DuRuInterface/MuLawCompression/arm_mu_law_compression.cpp @@ -520,14 +520,12 @@ armral_status armral_mu_law_compr_8bit(uint32_t n_prb, if (scale != nullptr) { prb_in = load3_cmplx_and_scale((const int16_t *)src, *scale); - src += 12; } else { prb_in.val[0] = vld1q_s16((const int16_t *)src); src += 4; prb_in.val[1] = vld1q_s16((const int16_t *)src); src += 4; prb_in.val[2] = vld1q_s16((const int16_t *)src); - src += 4; } // Extract the sign bit and absolute values for the PRB @@ -978,14 +976,12 @@ armral_status armral_mu_law_compr_9bit(uint32_t n_prb, int16x8x3_t prb_in; if (scale != nullptr) { prb_in = load3_cmplx_and_scale((const int16_t *)src, *scale); - src += 12; } else { prb_in.val[0] = vld1q_s16((const int16_t *)src); src += 4; prb_in.val[1] = vld1q_s16((const int16_t *)src); src += 4; prb_in.val[2] = vld1q_s16((const int16_t *)src); - src += 4; } // Extract the sign bit and absolute values for the PRB @@ -1531,14 +1527,12 @@ armral_status armral_mu_law_compr_14bit(uint32_t 
n_prb, int16x8x3_t prb_in; if (scale != nullptr) { prb_in = load3_cmplx_and_scale((const int16_t *)src, *scale); - src += 12; } else { prb_in.val[0] = vld1q_s16((const int16_t *)src); src += 4; prb_in.val[1] = vld1q_s16((const int16_t *)src); src += 4; prb_in.val[2] = vld1q_s16((const int16_t *)src); - src += 4; } // Extract the sign bit and absolute values for the PRB diff --git a/src/DuRuInterface/ORanBlockFloat/arm_block_float_compression.cpp b/src/DuRuInterface/ORanBlockFloat/arm_block_float_compression.cpp index 835a1081041ab86b99e7dfefe924ca0eedaa1214..be7e41ebbd66001c0f3552a03a5019bcb96769c5 100644 --- a/src/DuRuInterface/ORanBlockFloat/arm_block_float_compression.cpp +++ b/src/DuRuInterface/ORanBlockFloat/arm_block_float_compression.cpp @@ -217,9 +217,6 @@ armral_status armral_block_float_compr_8bit(uint32_t n_prb, shifted_output = vshlq_s16(in.val[2], exp_vec); compressed_output = vmovn_s16(shifted_output); vst1_s8(data_out, compressed_output); - - /*Next compressed struct*/ - dst++; } return ARMRAL_SUCCESS; #endif @@ -322,7 +319,6 @@ armral_status armral_block_float_compr_9bit(uint32_t n_prb, pack_9bit_and_store_int16<1>(reg, dst); - dst++; data_in += 24; } return ARMRAL_SUCCESS; @@ -742,7 +738,6 @@ armral_block_float_compr_14bit(uint32_t n_prb, const armral_cmplx_int16_t *src, dst->exp = exps[0]; pack_14bit_and_store_int16<1, true>(reg, dst, exps); - dst++; } return ARMRAL_SUCCESS; #endif diff --git a/src/LowerPHY/FFT/bluestein.cpp b/src/LowerPHY/FFT/bluestein.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c35dd112f34390b04b5726dccc2eabd5f4a2e82e --- /dev/null +++ b/src/LowerPHY/FFT/bluestein.cpp @@ -0,0 +1,235 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +*/ + +#include "bluestein.hpp" +#include "fft_execute.hpp" + +#include + +namespace armral::fft { + +template +bluestein make_bluestein(int n, armral_fft_direction_t dir, + const int *base_kernels, + int 
len_base_kernels) { + using real_t = armral::fft::real_t; + + // Look for the next size > 2n-1 which would allow us to use fast kernels alone + int n_pad = 2 * n - 1; + n_pad--; + int curn_n = 0; + do { + curn_n = ++n_pad; + for (int i = 0; i < len_base_kernels; i++) { + if (!base_kernels[i]) { + continue; + } + while (curn_n % i == 0) { + curn_n /= i; + } + } + } while (curn_n != 1); + + Tw *a = static_cast(malloc(n_pad * sizeof(Tw))); + Tw *b = static_cast(calloc(n_pad, sizeof(Tw))); + + // Populate the two vectors to be used in the convolution, a and b. + for (int i = 0; i < n; i++) { + auto c = (real_t)cos(M_PI * (real_t)i * (real_t)i / (real_t)n); + auto s = (real_t)sin(M_PI * (real_t)i * (real_t)i / (real_t)n); + a[i] = {c, (real_t)-s}; // { cos(-pi*i*i/n), sin(-pi*i*i/n) } + b[i] = {c, s}; // { cos( pi*i*i/n), sin( pi*i*i/n) } + } + + // Make sure b is periodic + int j = n_pad - n + 1; + for (int i = n - 1; i > 0; i--) { + b[j++] = b[i]; + } + + // Create 2 plans: forward and backward + armral_fft_plan_t *pf = nullptr; + armral_fft_plan_t *pb = nullptr; + armral::fft::create_plan( + &pf, n_pad, armral_fft_direction_t::ARMRAL_FFT_FORWARDS, false); + armral::fft::create_plan( + &pb, n_pad, armral_fft_direction_t::ARMRAL_FFT_BACKWARDS, false); + + // Execute fwds plan transforming series b + armral::fft::execute(pf, b, b, 1, 1, 1); + + // Multiply output from FFT of b with 1/n_pad + real_t recip_npad = 1.0 / n_pad; + for (int i = 0; i < n_pad; i++) { + b[i] = {b[i].re * recip_npad, b[i].im * recip_npad}; + } + + return bluestein{n, n_pad, dir, pf, pb, a, b}; +} + +template bluestein +make_bluestein(int n, armral_fft_direction_t dir, const int *base_kernels, + int len_base_kernels); +template bluestein +make_bluestein(int n, armral_fft_direction_t dir, const int *base_kernels, + int len_base_kernels); +template bluestein +make_bluestein(int n, armral_fft_direction_t dir, const int *base_kernels, + int len_base_kernels); +template bluestein +make_bluestein(int n, 
armral_fft_direction_t dir, const int *base_kernels, + int len_base_kernels); + +template struct bluestein; +template struct bluestein; +template struct bluestein; +template struct bluestein; + +template +static inline void multiply_a_x(Tw *work_ptr, const Tx *x, const Tw *a, int n, + int n_pad, int istride) { + for (int i = 0; i < n; i++) { + Tw xi = armral::fft::cast(x[i * istride]); + work_ptr[i].re = a[i].re * xi.re - a[i].im * xi.im; + work_ptr[i].im = a[i].re * xi.im + a[i].im * xi.re; + } + for (int i = n; i < n_pad; i++) { + work_ptr[i] = {0, 0}; + } +} + +template +static inline void multiply_a_x_dit(Tw *work_ptr, const Tx *x, const Tw *a, + int n, int n_pad, int istride, + const Tw *w) { + Tw xi = armral::fft::cast(x[0]); + work_ptr[0].re = a[0].re * xi.re - a[0].im * xi.im; + work_ptr[0].im = a[0].re * xi.im + a[0].im * xi.re; + for (int i = 1; i < n; i++) { + xi = armral::fft::cast(x[i * istride]); + Tw wi = w[(i - 1) * 2]; + Tw tmp = {xi.re * wi.re - xi.im * wi.im, xi.re * wi.im + xi.im * wi.re}; + work_ptr[i] = {a[i].re * tmp.re - a[i].im * tmp.im, + a[i].re * tmp.im + a[i].im * tmp.re}; + } + for (int i = n; i < n_pad; i++) { + work_ptr[i] = {0, 0}; + } +} + +template +static inline void multiply_y_a(const Tw *work_ptr, Ty *y, const Tw *a, int n, + int ostride, armral_fft_direction_t dir) { + Tw tmp = {work_ptr[0].re * a[0].re - work_ptr[0].im * a[0].im, + work_ptr[0].re * a[0].im + work_ptr[0].im * a[0].re}; + y[0] = armral::fft::cast(tmp); + if (dir == armral_fft_direction_t::ARMRAL_FFT_BACKWARDS) { + y = y + n * ostride; + ostride = -ostride; + } + for (int i = 1; i < n; i++) { + tmp.re = work_ptr[i].re * a[i].re - work_ptr[i].im * a[i].im; + tmp.im = work_ptr[i].re * a[i].im + work_ptr[i].im * a[i].re; + y[i * ostride] = armral::fft::cast(tmp); + } +} + +template +void execute_bluestein(const bluestein &bs, const Tx *x, Ty *y, + int istride, int ostride, const Tw *w, int howmany, + int idist, int odist) { + + Tw *work_ptr = 
static_cast(malloc(bs.n_pad * sizeof(Tw))); + + for (int i = 0; i < howmany; i++) { + // Multiply input by a and store in work + if (w != NULL && i > 0) { + multiply_a_x_dit(work_ptr, &x[i * idist], bs.a, bs.n, bs.n_pad, istride, + &w[2 * (bs.n - 1) * (i - 1)]); + } else { + multiply_a_x(work_ptr, &x[i * idist], bs.a, bs.n, bs.n_pad, istride); + } + + armral::fft::execute(bs.pf, work_ptr, work_ptr, 1, 1, 1); + for (int j = 0; j < bs.n_pad; j++) { + Tw tmp = {work_ptr[j].re * bs.b[j].re - work_ptr[j].im * bs.b[j].im, + work_ptr[j].re * bs.b[j].im + work_ptr[j].im * bs.b[j].re}; + work_ptr[j] = tmp; + } + + armral::fft::execute(bs.pb, work_ptr, work_ptr, 1, 1, 1); + + // Multiply by a and store in output vector y + multiply_y_a(work_ptr, &y[i * odist], bs.a, bs.n, ostride, bs.dir); + } + free(work_ptr); +} + +template void +execute_bluestein( + const bluestein + &bs, + const armral_cmplx_f32_t *x, armral_cmplx_f32_t *y, int istride, + int ostride, const armral_cmplx_f32_t *w, int howmany, int idist, + int odist); +template void execute_bluestein( + const bluestein &bs, + const armral_cmplx_int16_t *x, armral_cmplx_int16_t *y, int istride, + int ostride, const armral_cmplx_f32_t *w, int howmany, int idist, + int odist); +template void +execute_bluestein( + const bluestein &bs, + const armral_cmplx_int16_t *x, armral_cmplx_f32_t *y, int istride, + int ostride, const armral_cmplx_f32_t *w, int howmany, int idist, + int odist); +template void +execute_bluestein( + const bluestein &bs, + const armral_cmplx_f32_t *x, armral_cmplx_int16_t *y, int istride, + int ostride, const armral_cmplx_f32_t *w, int howmany, int idist, + int odist); + +template +bluestein::~bluestein() { + if (pf != nullptr) { + armral::fft::destroy_plan(&pf); + free(pf); + pf = nullptr; + } + if (pb != nullptr) { + armral::fft::destroy_plan(&pb); + free(pb); + pb = nullptr; + } + if (a) { + free(const_cast(a)); + a = nullptr; + } + if (b) { + free(const_cast(b)); + b = nullptr; + } +} + +template 
bluestein::~bluestein(); +template bluestein::~bluestein(); +template bluestein::~bluestein(); +template bluestein::~bluestein(); + +} // end namespace armral::fft diff --git a/src/LowerPHY/FFT/bluestein.hpp b/src/LowerPHY/FFT/bluestein.hpp new file mode 100644 index 0000000000000000000000000000000000000000..15811ef0b64203c7f93d092151433e3e943dddcc --- /dev/null +++ b/src/LowerPHY/FFT/bluestein.hpp @@ -0,0 +1,66 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +*/ + +#pragma once + +#include "fft_plan.hpp" + +namespace armral::fft { + +/// Class to support using Bluestein's algorithm for prime n. +template +struct bluestein { + int n; + int n_pad; + armral_fft_direction_t dir; + + armral_fft_plan_t *pf; + armral_fft_plan_t *pb; + + const Tw *a; + const Tw *b; + + bluestein() + : n(0), n_pad(0), dir((armral_fft_direction_t)0), pf(nullptr), pb(nullptr), + a(nullptr), b(nullptr) {} + + bluestein(const bluestein &) = delete; + + bluestein(bluestein &&other) + : n(other.n), n_pad(other.n_pad), dir(other.dir), pf(other.pf), + pb(other.pb), a(other.a), b(other.b) { + other.a = nullptr; + other.b = nullptr; + other.pf = nullptr; + other.pb = nullptr; + } + + bluestein &operator=(const bluestein &) = delete; + bluestein &operator=(bluestein &&) = delete; + + bluestein(int n_in, int n_pad_in, armral_fft_direction_t dir_in, + armral_fft_plan_t *pf_in, armral_fft_plan_t *pb_in, Tw *a_in, + Tw *b_in) + : n(n_in), n_pad(n_pad_in), dir(dir_in), pf(pf_in), pb(pb_in), a(a_in), + b(b_in) {} + + ~bluestein(); + + operator bool() const { + return n != 0; + } +}; + +template +bluestein make_bluestein(int n, armral_fft_direction_t dir, + const int *base_kernels, + int len_base_kernels); + +template +void execute_bluestein(const bluestein &bs, const Tx *x, Ty *y, + int istride, int ostride, const Tw *w, int howmany, + int idist, int odist); + +} // namespace armral::fft diff --git a/src/LowerPHY/FFT/fft_cf32.cpp 
b/src/LowerPHY/FFT/fft_cf32.cpp index 28cbe6f9514998eb6ab29ac8cd99715593af6255..6917fd21ed7d76e70ac88caf573f3b95fc57003a 100644 --- a/src/LowerPHY/FFT/fft_cf32.cpp +++ b/src/LowerPHY/FFT/fft_cf32.cpp @@ -8,7 +8,7 @@ armral_status armral_fft_create_plan_cf32(armral_fft_plan_t **p, int n, armral_fft_direction_t dir) { return armral::fft::create_plan(p, n, dir); + armral_cmplx_f32_t>(p, n, dir, true); } armral_status armral_fft_execute_cf32(const armral_fft_plan_t *p, diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_uu.c b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_uu.c index 84ba9f8e11c2522d20c4e4f7046c24b194d1f1af..237b077fa7950c58577d7ee69e1c068b35e9627c 100644 --- a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_uu.c +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_uu.c @@ -23,12 +23,12 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu2(const armral_cmplx_f32_t *restrict x, float32x2_t *v138 = &v6[ostride]; const float32x2_t *v110 = &v5[0]; float32x2_t *v129 = &v6[0]; - float32x4_t v144 = *(const float32x4_t *)v119; - float32x4_t v142 = *(const float32x4_t *)v110; + float32x4_t v144 = vld1q_f32((const float32_t *)v119); + float32x4_t v142 = vld1q_f32((const float32_t *)v110); float32x4_t v35 = vaddq_f32(v142, v144); float32x4_t v36 = vsubq_f32(v142, v144); - *(float32x4_t *)v129 = v35; - *(float32x4_t *)v138 = v36; + vst1q_f32((float32_t *)v129, v35); + vst1q_f32((float32_t *)v138, v36); v5 += 2 * 1; v6 += 2 * 1; } @@ -101,14 +101,14 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu3(const armral_cmplx_f32_t *restrict x, float32x2_t v57 = (float32x2_t){v55, v56}; const float32x2_t *v175 = &v5[0]; float32x2_t *v185 = &v6[0]; - float32x4_t v207 = *(const float32x4_t *)v156; + float32x4_t v207 = vld1q_f32((const float32_t *)v156); float32x4_t v53 = vcombine_f32(v52, v52); float32x2_t v59 = vmul_f32(v58, v57); const float32x2_t *v165 = &v5[istride * 2]; float32x2_t *v203 = &v6[ostride * 2]; - float32x4_t v211 = *(const float32x4_t *)v175; + float32x4_t v211 = vld1q_f32((const float32_t 
*)v175); float32x4_t v61 = vcombine_f32(v59, v59); - float32x4_t v209 = *(const float32x4_t *)v165; + float32x4_t v209 = vld1q_f32((const float32_t *)v165); float32x4_t v35 = vaddq_f32(v207, v209); float32x4_t v36 = vsubq_f32(v207, v209); float32x4_t v44 = vaddq_f32(v35, v211); @@ -116,11 +116,11 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu3(const armral_cmplx_f32_t *restrict x, float32x4_t v60 = vrev64q_f32(v36); float32x4_t v62 = vmulq_f32(v60, v61); float32x4_t v63 = vaddq_f32(v44, v54); - *(float32x4_t *)v185 = v44; + vst1q_f32((float32_t *)v185, v44); float32x4_t v64 = vaddq_f32(v63, v62); float32x4_t v65 = vsubq_f32(v63, v62); - *(float32x4_t *)v194 = v65; - *(float32x4_t *)v203 = v64; + vst1q_f32((float32_t *)v194, v65); + vst1q_f32((float32_t *)v203, v64); v5 += 2 * 1; v6 += 2 * 1; } @@ -230,16 +230,16 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu4(const armral_cmplx_f32_t *restrict x, float32x2_t v72 = (float32x2_t){v70, v71}; const float32x2_t *v194 = &v5[0]; float32x2_t *v231 = &v6[0]; - float32x4_t v266 = *(const float32x4_t *)v212; + float32x4_t v266 = vld1q_f32((const float32_t *)v212); float32x2_t v74 = vmul_f32(v73, v72); const float32x2_t *v203 = &v5[istride * 2]; const float32x2_t *v221 = &v5[istride * 3]; float32x2_t *v249 = &v6[ostride * 2]; float32x2_t *v258 = &v6[ostride * 3]; - float32x4_t v262 = *(const float32x4_t *)v194; + float32x4_t v262 = vld1q_f32((const float32_t *)v194); float32x4_t v76 = vcombine_f32(v74, v74); - float32x4_t v264 = *(const float32x4_t *)v203; - float32x4_t v268 = *(const float32x4_t *)v221; + float32x4_t v264 = vld1q_f32((const float32_t *)v203); + float32x4_t v268 = vld1q_f32((const float32_t *)v221); float32x4_t v35 = vaddq_f32(v262, v264); float32x4_t v36 = vsubq_f32(v262, v264); float32x4_t v51 = vaddq_f32(v266, v268); @@ -248,12 +248,12 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu4(const armral_cmplx_f32_t *restrict x, float32x4_t v54 = vsubq_f32(v35, v51); float32x4_t v75 = vrev64q_f32(v52); float32x4_t v77 = 
vmulq_f32(v75, v76); - *(float32x4_t *)v231 = v53; - *(float32x4_t *)v249 = v54; + vst1q_f32((float32_t *)v231, v53); + vst1q_f32((float32_t *)v249, v54); float32x4_t v78 = vaddq_f32(v36, v77); float32x4_t v79 = vsubq_f32(v36, v77); - *(float32x4_t *)v240 = v79; - *(float32x4_t *)v258 = v78; + vst1q_f32((float32_t *)v240, v79); + vst1q_f32((float32_t *)v258, v78); v5 += 2 * 1; v6 += 2 * 1; } @@ -384,7 +384,7 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu5(const armral_cmplx_f32_t *restrict x, float32x2_t v97 = (float32x2_t){v95, v96}; const float32x2_t *v302 = &v5[0]; float32x2_t *v312 = &v6[0]; - float32x4_t v352 = *(const float32x4_t *)v265; + float32x4_t v352 = vld1q_f32((const float32_t *)v265); float32x4_t v72 = vcombine_f32(v71, v71); float32x4_t v77 = vcombine_f32(v76, v76); float32x2_t v83 = vmul_f32(v98, v81); @@ -396,13 +396,13 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu5(const armral_cmplx_f32_t *restrict x, float32x2_t *v330 = &v6[ostride * 2]; float32x2_t *v339 = &v6[ostride * 3]; float32x2_t *v348 = &v6[ostride * 4]; - float32x4_t v360 = *(const float32x4_t *)v302; + float32x4_t v360 = vld1q_f32((const float32_t *)v302); float32x4_t v85 = vcombine_f32(v83, v83); float32x4_t v93 = vcombine_f32(v91, v91); float32x4_t v101 = vcombine_f32(v99, v99); - float32x4_t v354 = *(const float32x4_t *)v274; - float32x4_t v356 = *(const float32x4_t *)v283; - float32x4_t v358 = *(const float32x4_t *)v292; + float32x4_t v354 = vld1q_f32((const float32_t *)v274); + float32x4_t v356 = vld1q_f32((const float32_t *)v283); + float32x4_t v358 = vld1q_f32((const float32_t *)v292); float32x4_t v35 = vaddq_f32(v352, v354); float32x4_t v36 = vsubq_f32(v352, v354); float32x4_t v51 = vaddq_f32(v356, v358); @@ -420,7 +420,7 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu5(const armral_cmplx_f32_t *restrict x, float32x4_t v102 = vmulq_f32(v100, v101); float32x4_t v94 = vmulq_f32(v92, v93); float32x4_t v103 = vaddq_f32(v63, v73); - *(float32x4_t *)v312 = v63; + vst1q_f32((float32_t *)v312, v63); 
float32x4_t v104 = vaddq_f32(v103, v78); float32x4_t v105 = vsubq_f32(v103, v78); float32x4_t v106 = vsubq_f32(v86, v94); @@ -429,10 +429,10 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu5(const armral_cmplx_f32_t *restrict x, float32x4_t v109 = vsubq_f32(v104, v106); float32x4_t v110 = vaddq_f32(v105, v107); float32x4_t v111 = vsubq_f32(v105, v107); - *(float32x4_t *)v321 = v109; - *(float32x4_t *)v330 = v111; - *(float32x4_t *)v339 = v110; - *(float32x4_t *)v348 = v108; + vst1q_f32((float32_t *)v321, v109); + vst1q_f32((float32_t *)v330, v111); + vst1q_f32((float32_t *)v339, v110); + vst1q_f32((float32_t *)v348, v108); v5 += 2 * 1; v6 += 2 * 1; } @@ -618,7 +618,7 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu6(const armral_cmplx_f32_t *restrict x, float32x2_t v108 = (float32x2_t){v106, v107}; const float32x2_t *v286 = &v5[0]; float32x2_t *v341 = &v6[0]; - float32x4_t v400 = *(const float32x4_t *)v331; + float32x4_t v400 = vld1q_f32((const float32_t *)v331); float32x4_t v104 = vcombine_f32(v103, v103); float32x2_t v110 = vmul_f32(v109, v108); const float32x2_t *v295 = &v5[istride * 3]; @@ -629,12 +629,12 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu6(const armral_cmplx_f32_t *restrict x, float32x2_t *v359 = &v6[ostride * 4]; float32x2_t *v377 = &v6[ostride * 2]; float32x2_t *v386 = &v6[ostride * 5]; - float32x4_t v390 = *(const float32x4_t *)v286; + float32x4_t v390 = vld1q_f32((const float32_t *)v286); float32x4_t v112 = vcombine_f32(v110, v110); - float32x4_t v392 = *(const float32x4_t *)v295; - float32x4_t v394 = *(const float32x4_t *)v304; - float32x4_t v396 = *(const float32x4_t *)v313; - float32x4_t v398 = *(const float32x4_t *)v322; + float32x4_t v392 = vld1q_f32((const float32_t *)v295); + float32x4_t v394 = vld1q_f32((const float32_t *)v304); + float32x4_t v396 = vld1q_f32((const float32_t *)v313); + float32x4_t v398 = vld1q_f32((const float32_t *)v322); float32x4_t v35 = vaddq_f32(v390, v392); float32x4_t v36 = vsubq_f32(v390, v392); float32x4_t v51 = vaddq_f32(v394, 
v396); @@ -655,16 +655,16 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu6(const armral_cmplx_f32_t *restrict x, float32x4_t v90 = vaddq_f32(v71, v81); float32x4_t v113 = vmulq_f32(v111, v112); float32x4_t v114 = vaddq_f32(v95, v105); - *(float32x4_t *)v341 = v71; - *(float32x4_t *)v350 = v95; + vst1q_f32((float32_t *)v341, v71); + vst1q_f32((float32_t *)v350, v95); float32x4_t v91 = vaddq_f32(v90, v89); float32x4_t v92 = vsubq_f32(v90, v89); float32x4_t v115 = vaddq_f32(v114, v113); float32x4_t v116 = vsubq_f32(v114, v113); - *(float32x4_t *)v359 = v92; - *(float32x4_t *)v368 = v116; - *(float32x4_t *)v377 = v91; - *(float32x4_t *)v386 = v115; + vst1q_f32((float32_t *)v359, v92); + vst1q_f32((float32_t *)v368, v116); + vst1q_f32((float32_t *)v377, v91); + vst1q_f32((float32_t *)v386, v115); v5 += 2 * 1; v6 += 2 * 1; } @@ -859,7 +859,7 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu7(const armral_cmplx_f32_t *restrict x, float32x2_t v138 = (float32x2_t){v136, v137}; const float32x2_t *v439 = &v5[0]; float32x2_t *v449 = &v6[0]; - float32x4_t v507 = *(const float32x4_t *)v384; + float32x4_t v507 = vld1q_f32((const float32_t *)v384); float32x4_t v95 = vcombine_f32(v94, v94); float32x4_t v100 = vcombine_f32(v99, v99); float32x4_t v105 = vcombine_f32(v104, v104); @@ -878,16 +878,16 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu7(const armral_cmplx_f32_t *restrict x, float32x2_t *v485 = &v6[ostride * 4]; float32x2_t *v494 = &v6[ostride * 5]; float32x2_t *v503 = &v6[ostride * 6]; - float32x4_t v519 = *(const float32x4_t *)v439; + float32x4_t v519 = vld1q_f32((const float32_t *)v439); float32x4_t v118 = vcombine_f32(v116, v116); float32x4_t v126 = vcombine_f32(v124, v124); float32x4_t v134 = vcombine_f32(v132, v132); float32x4_t v142 = vcombine_f32(v140, v140); - float32x4_t v509 = *(const float32x4_t *)v393; - float32x4_t v511 = *(const float32x4_t *)v402; - float32x4_t v513 = *(const float32x4_t *)v411; - float32x4_t v515 = *(const float32x4_t *)v420; - float32x4_t v517 = *(const 
float32x4_t *)v429; + float32x4_t v509 = vld1q_f32((const float32_t *)v393); + float32x4_t v511 = vld1q_f32((const float32_t *)v402); + float32x4_t v513 = vld1q_f32((const float32_t *)v411); + float32x4_t v515 = vld1q_f32((const float32_t *)v420); + float32x4_t v517 = vld1q_f32((const float32_t *)v429); float32x4_t v35 = vaddq_f32(v507, v509); float32x4_t v36 = vsubq_f32(v507, v509); float32x4_t v51 = vaddq_f32(v511, v513); @@ -918,7 +918,7 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu7(const armral_cmplx_f32_t *restrict x, float32x4_t v143 = vmulq_f32(v141, v142); float32x4_t v119 = vmulq_f32(v117, v118); float32x4_t v144 = vaddq_f32(v78, v96); - *(float32x4_t *)v449 = v78; + vst1q_f32((float32_t *)v449, v78); float32x4_t v145 = vaddq_f32(v144, v101); float32x4_t v147 = vsubq_f32(v144, v101); float32x4_t v149 = vsubq_f32(v144, v106); @@ -937,12 +937,12 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu7(const armral_cmplx_f32_t *restrict x, float32x4_t v160 = vsubq_f32(v148, v154); float32x4_t v161 = vaddq_f32(v150, v156); float32x4_t v162 = vsubq_f32(v150, v156); - *(float32x4_t *)v458 = v158; - *(float32x4_t *)v467 = v160; - *(float32x4_t *)v476 = v161; - *(float32x4_t *)v485 = v162; - *(float32x4_t *)v494 = v159; - *(float32x4_t *)v503 = v157; + vst1q_f32((float32_t *)v458, v158); + vst1q_f32((float32_t *)v467, v160); + vst1q_f32((float32_t *)v476, v161); + vst1q_f32((float32_t *)v485, v162); + vst1q_f32((float32_t *)v494, v159); + vst1q_f32((float32_t *)v503, v157); v5 += 2 * 1; v6 += 2 * 1; } @@ -1227,7 +1227,7 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu8(const armral_cmplx_f32_t *restrict x, float32x2_t v139 = (float32x2_t){v138, v138}; const float32x2_t *v374 = &v5[0]; float32x2_t *v447 = &v6[0]; - float32x4_t v522 = *(const float32x4_t *)v410; + float32x4_t v522 = vld1q_f32((const float32_t *)v410); float32x2_t v125 = vmul_f32(v132, v123); float32x2_t v133 = vmul_f32(v132, v131); float32x4_t v140 = vcombine_f32(v139, v139); @@ -1243,15 +1243,15 @@ void 
armral_fft_cf32_cf32_cf32_ac_n_uu8(const armral_cmplx_f32_t *restrict x, float32x2_t *v492 = &v6[ostride * 5]; float32x2_t *v501 = &v6[ostride * 6]; float32x2_t *v510 = &v6[ostride * 7]; - float32x4_t v514 = *(const float32x4_t *)v374; + float32x4_t v514 = vld1q_f32((const float32_t *)v374); float32x4_t v127 = vcombine_f32(v125, v125); float32x4_t v135 = vcombine_f32(v133, v133); - float32x4_t v516 = *(const float32x4_t *)v383; - float32x4_t v518 = *(const float32x4_t *)v392; - float32x4_t v520 = *(const float32x4_t *)v401; - float32x4_t v524 = *(const float32x4_t *)v419; - float32x4_t v526 = *(const float32x4_t *)v428; - float32x4_t v528 = *(const float32x4_t *)v437; + float32x4_t v516 = vld1q_f32((const float32_t *)v383); + float32x4_t v518 = vld1q_f32((const float32_t *)v392); + float32x4_t v520 = vld1q_f32((const float32_t *)v401); + float32x4_t v524 = vld1q_f32((const float32_t *)v419); + float32x4_t v526 = vld1q_f32((const float32_t *)v428); + float32x4_t v528 = vld1q_f32((const float32_t *)v437); float32x4_t v35 = vaddq_f32(v514, v516); float32x4_t v36 = vsubq_f32(v514, v516); float32x4_t v51 = vaddq_f32(v518, v520); @@ -1277,8 +1277,8 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu8(const armral_cmplx_f32_t *restrict x, float32x4_t v136 = vmulq_f32(v134, v135); float32x4_t v144 = vaddq_f32(v36, v141); float32x4_t v145 = vsubq_f32(v36, v141); - *(float32x4_t *)v447 = v89; - *(float32x4_t *)v483 = v90; + vst1q_f32((float32_t *)v447, v89); + vst1q_f32((float32_t *)v483, v90); float32x4_t v142 = vaddq_f32(v86, v115); float32x4_t v143 = vsubq_f32(v86, v115); float32x4_t v146 = vaddq_f32(v128, v136); @@ -1287,12 +1287,12 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu8(const armral_cmplx_f32_t *restrict x, float32x4_t v149 = vsubq_f32(v144, v146); float32x4_t v150 = vaddq_f32(v145, v147); float32x4_t v151 = vsubq_f32(v145, v147); - *(float32x4_t *)v465 = v143; - *(float32x4_t *)v501 = v142; - *(float32x4_t *)v456 = v149; - *(float32x4_t *)v474 = v150; - *(float32x4_t *)v492 
= v151; - *(float32x4_t *)v510 = v148; + vst1q_f32((float32_t *)v465, v143); + vst1q_f32((float32_t *)v501, v142); + vst1q_f32((float32_t *)v456, v149); + vst1q_f32((float32_t *)v474, v150); + vst1q_f32((float32_t *)v492, v151); + vst1q_f32((float32_t *)v510, v148); v5 += 2 * 1; v6 += 2 * 1; } @@ -1541,7 +1541,7 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu9(const armral_cmplx_f32_t *restrict x, float32x2_t v168 = (float32x2_t){v166, v167}; const float32x2_t *v547 = &v5[0]; float32x2_t *v557 = &v6[0]; - float32x4_t v633 = *(const float32x4_t *)v474; + float32x4_t v633 = vld1q_f32((const float32_t *)v474); float32x4_t v112 = vcombine_f32(v111, v111); float32x4_t v125 = vcombine_f32(v124, v124); float32x2_t v131 = vmul_f32(v169, v129); @@ -1565,18 +1565,18 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu9(const armral_cmplx_f32_t *restrict x, float32x2_t *v611 = &v6[ostride * 6]; float32x2_t *v620 = &v6[ostride * 7]; float32x2_t *v629 = &v6[ostride * 8]; - float32x4_t v649 = *(const float32x4_t *)v547; + float32x4_t v649 = vld1q_f32((const float32_t *)v547); float32x4_t v133 = vcombine_f32(v131, v131); float32x4_t v156 = vcombine_f32(v154, v154); float32x4_t v164 = vcombine_f32(v162, v162); float32x4_t v172 = vcombine_f32(v170, v170); - float32x4_t v635 = *(const float32x4_t *)v483; - float32x4_t v637 = *(const float32x4_t *)v492; - float32x4_t v639 = *(const float32x4_t *)v501; - float32x4_t v641 = *(const float32x4_t *)v510; - float32x4_t v643 = *(const float32x4_t *)v519; - float32x4_t v645 = *(const float32x4_t *)v528; - float32x4_t v647 = *(const float32x4_t *)v537; + float32x4_t v635 = vld1q_f32((const float32_t *)v483); + float32x4_t v637 = vld1q_f32((const float32_t *)v492); + float32x4_t v639 = vld1q_f32((const float32_t *)v501); + float32x4_t v641 = vld1q_f32((const float32_t *)v510); + float32x4_t v643 = vld1q_f32((const float32_t *)v519); + float32x4_t v645 = vld1q_f32((const float32_t *)v528); + float32x4_t v647 = vld1q_f32((const float32_t *)v537); float32x4_t v35 
= vaddq_f32(v633, v635); float32x4_t v36 = vsubq_f32(v633, v635); float32x4_t v51 = vaddq_f32(v637, v639); @@ -1621,7 +1621,7 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu9(const armral_cmplx_f32_t *restrict x, float32x4_t v188 = vaddq_f32(v187, v165); float32x4_t v190 = vaddq_f32(v189, v173); float32x4_t v192 = vsubq_f32(v191, v173); - *(float32x4_t *)v557 = v95; + vst1q_f32((float32_t *)v557, v95); float32x4_t v176 = vaddq_f32(v95, v175); float32x4_t v180 = vaddq_f32(v179, v174); float32x4_t v177 = vaddq_f32(v176, v121); @@ -1632,20 +1632,20 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu9(const armral_cmplx_f32_t *restrict x, float32x4_t v182 = vaddq_f32(v181, v144); float32x4_t v184 = vaddq_f32(v183, v149); float32x4_t v186 = vsubq_f32(v185, v149); - *(float32x4_t *)v584 = v178; - *(float32x4_t *)v611 = v177; + vst1q_f32((float32_t *)v584, v178); + vst1q_f32((float32_t *)v611, v177); float32x4_t v193 = vaddq_f32(v182, v188); float32x4_t v194 = vsubq_f32(v182, v188); float32x4_t v195 = vaddq_f32(v184, v190); float32x4_t v196 = vsubq_f32(v184, v190); float32x4_t v197 = vaddq_f32(v186, v192); float32x4_t v198 = vsubq_f32(v186, v192); - *(float32x4_t *)v566 = v194; - *(float32x4_t *)v575 = v195; - *(float32x4_t *)v593 = v198; - *(float32x4_t *)v602 = v197; - *(float32x4_t *)v620 = v196; - *(float32x4_t *)v629 = v193; + vst1q_f32((float32_t *)v566, v194); + vst1q_f32((float32_t *)v575, v195); + vst1q_f32((float32_t *)v593, v198); + vst1q_f32((float32_t *)v602, v197); + vst1q_f32((float32_t *)v620, v196); + vst1q_f32((float32_t *)v629, v193); v5 += 2 * 1; v6 += 2 * 1; } @@ -1992,7 +1992,7 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu10(const armral_cmplx_f32_t *restrict x, float32x2_t v198 = (float32x2_t){v196, v197}; const float32x2_t *v512 = &v5[0]; float32x2_t *v603 = &v6[0]; - float32x4_t v702 = *(const float32x4_t *)v575; + float32x4_t v702 = vld1q_f32((const float32_t *)v575); float32x4_t v173 = vcombine_f32(v172, v172); float32x4_t v178 = vcombine_f32(v177, v177); 
float32x2_t v184 = vmul_f32(v199, v182); @@ -2014,18 +2014,18 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu10(const armral_cmplx_f32_t *restrict x, float32x2_t *v666 = &v6[ostride * 3]; float32x2_t *v675 = &v6[ostride * 4]; float32x2_t *v684 = &v6[ostride * 9]; - float32x4_t v688 = *(const float32x4_t *)v512; + float32x4_t v688 = vld1q_f32((const float32_t *)v512); float32x4_t v186 = vcombine_f32(v184, v184); float32x4_t v194 = vcombine_f32(v192, v192); float32x4_t v202 = vcombine_f32(v200, v200); - float32x4_t v690 = *(const float32x4_t *)v521; - float32x4_t v692 = *(const float32x4_t *)v530; - float32x4_t v694 = *(const float32x4_t *)v539; - float32x4_t v696 = *(const float32x4_t *)v548; - float32x4_t v698 = *(const float32x4_t *)v557; - float32x4_t v700 = *(const float32x4_t *)v566; - float32x4_t v704 = *(const float32x4_t *)v584; - float32x4_t v706 = *(const float32x4_t *)v593; + float32x4_t v690 = vld1q_f32((const float32_t *)v521); + float32x4_t v692 = vld1q_f32((const float32_t *)v530); + float32x4_t v694 = vld1q_f32((const float32_t *)v539); + float32x4_t v696 = vld1q_f32((const float32_t *)v548); + float32x4_t v698 = vld1q_f32((const float32_t *)v557); + float32x4_t v700 = vld1q_f32((const float32_t *)v566); + float32x4_t v704 = vld1q_f32((const float32_t *)v584); + float32x4_t v706 = vld1q_f32((const float32_t *)v593); float32x4_t v35 = vaddq_f32(v688, v690); float32x4_t v36 = vsubq_f32(v688, v690); float32x4_t v51 = vaddq_f32(v692, v694); @@ -2070,8 +2070,8 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu10(const armral_cmplx_f32_t *restrict x, float32x4_t v148 = vaddq_f32(v108, v118); float32x4_t v195 = vmulq_f32(v193, v194); float32x4_t v204 = vaddq_f32(v164, v174); - *(float32x4_t *)v603 = v108; - *(float32x4_t *)v612 = v164; + vst1q_f32((float32_t *)v603, v108); + vst1q_f32((float32_t *)v612, v164); float32x4_t v149 = vaddq_f32(v148, v123); float32x4_t v150 = vsubq_f32(v148, v123); float32x4_t v151 = vsubq_f32(v131, v139); @@ -2088,14 +2088,14 @@ void 
armral_fft_cf32_cf32_cf32_ac_n_uu10(const armral_cmplx_f32_t *restrict x, float32x4_t v210 = vsubq_f32(v205, v207); float32x4_t v211 = vaddq_f32(v206, v208); float32x4_t v212 = vsubq_f32(v206, v208); - *(float32x4_t *)v621 = v154; - *(float32x4_t *)v630 = v210; - *(float32x4_t *)v639 = v156; - *(float32x4_t *)v648 = v212; - *(float32x4_t *)v657 = v155; - *(float32x4_t *)v666 = v211; - *(float32x4_t *)v675 = v153; - *(float32x4_t *)v684 = v209; + vst1q_f32((float32_t *)v621, v154); + vst1q_f32((float32_t *)v630, v210); + vst1q_f32((float32_t *)v639, v156); + vst1q_f32((float32_t *)v648, v212); + vst1q_f32((float32_t *)v657, v155); + vst1q_f32((float32_t *)v666, v211); + vst1q_f32((float32_t *)v675, v153); + vst1q_f32((float32_t *)v684, v209); v5 += 2 * 1; v6 += 2 * 1; } @@ -2462,7 +2462,7 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu11(const armral_cmplx_f32_t *restrict x, float32x2_t v264 = (float32x2_t){v262, v263}; const float32x2_t *v811 = &v5[0]; float32x2_t *v821 = &v6[0]; - float32x4_t v915 = *(const float32x4_t *)v720; + float32x4_t v915 = vld1q_f32((const float32_t *)v720); float32x4_t v143 = vcombine_f32(v142, v142); float32x2_t v149 = vmul_f32(v265, v147); float32x4_t v156 = vcombine_f32(v155, v155); @@ -2501,7 +2501,7 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu11(const armral_cmplx_f32_t *restrict x, float32x2_t *v884 = &v6[ostride * 4]; float32x2_t *v893 = &v6[ostride * 3]; float32x2_t *v902 = &v6[ostride * 2]; - float32x4_t v935 = *(const float32x4_t *)v811; + float32x4_t v935 = vld1q_f32((const float32_t *)v811); float32x4_t v151 = vcombine_f32(v149, v149); float32x4_t v204 = vcombine_f32(v202, v202); float32x4_t v212 = vcombine_f32(v210, v210); @@ -2512,15 +2512,15 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu11(const armral_cmplx_f32_t *restrict x, float32x4_t v252 = vcombine_f32(v250, v250); float32x4_t v260 = vcombine_f32(v258, v258); float32x4_t v268 = vcombine_f32(v266, v266); - float32x4_t v917 = *(const float32x4_t *)v729; - float32x4_t v919 = *(const 
float32x4_t *)v738; - float32x4_t v921 = *(const float32x4_t *)v747; - float32x4_t v923 = *(const float32x4_t *)v756; - float32x4_t v925 = *(const float32x4_t *)v765; - float32x4_t v927 = *(const float32x4_t *)v774; - float32x4_t v929 = *(const float32x4_t *)v783; - float32x4_t v931 = *(const float32x4_t *)v792; - float32x4_t v933 = *(const float32x4_t *)v801; + float32x4_t v917 = vld1q_f32((const float32_t *)v729); + float32x4_t v919 = vld1q_f32((const float32_t *)v738); + float32x4_t v921 = vld1q_f32((const float32_t *)v747); + float32x4_t v923 = vld1q_f32((const float32_t *)v756); + float32x4_t v925 = vld1q_f32((const float32_t *)v765); + float32x4_t v927 = vld1q_f32((const float32_t *)v774); + float32x4_t v929 = vld1q_f32((const float32_t *)v783); + float32x4_t v931 = vld1q_f32((const float32_t *)v792); + float32x4_t v933 = vld1q_f32((const float32_t *)v801); float32x4_t v35 = vaddq_f32(v915, v917); float32x4_t v50 = vaddq_f32(v919, v921); float32x4_t v65 = vaddq_f32(v923, v925); @@ -2605,7 +2605,7 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu11(const armral_cmplx_f32_t *restrict x, float32x4_t v284 = vsubq_f32(v253, v269); float32x4_t v285 = vaddq_f32(v275, v277); float32x4_t v303 = vaddq_f32(v279, v280); - *(float32x4_t *)v821 = v114; + vst1q_f32((float32_t *)v821, v114); float32x4_t v286 = vaddq_f32(v285, v270); float32x4_t v287 = vsubq_f32(v270, v272); float32x4_t v289 = vaddq_f32(v270, v276); @@ -2636,16 +2636,16 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu11(const armral_cmplx_f32_t *restrict x, float32x4_t v315 = vsubq_f32(v286, v296); float32x4_t v307 = vaddq_f32(v294, v306); float32x4_t v316 = vsubq_f32(v294, v306); - *(float32x4_t *)v839 = v308; - *(float32x4_t *)v848 = v309; - *(float32x4_t *)v857 = v310; - *(float32x4_t *)v866 = v311; - *(float32x4_t *)v875 = v312; - *(float32x4_t *)v884 = v313; - *(float32x4_t *)v893 = v314; - *(float32x4_t *)v902 = v315; - *(float32x4_t *)v830 = v307; - *(float32x4_t *)v911 = v316; + vst1q_f32((float32_t *)v839, v308); + 
vst1q_f32((float32_t *)v848, v309); + vst1q_f32((float32_t *)v857, v310); + vst1q_f32((float32_t *)v866, v311); + vst1q_f32((float32_t *)v875, v312); + vst1q_f32((float32_t *)v884, v313); + vst1q_f32((float32_t *)v893, v314); + vst1q_f32((float32_t *)v902, v315); + vst1q_f32((float32_t *)v830, v307); + vst1q_f32((float32_t *)v911, v316); v5 += 2 * 1; v6 += 2 * 1; } @@ -3212,7 +3212,7 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu12(const armral_cmplx_f32_t *restrict x, float32x2_t v211 = (float32x2_t){v210, v210}; const float32x2_t *v580 = &v5[0]; float32x2_t *v671 = &v6[0]; - float32x4_t v792 = *(const float32x4_t *)v643; + float32x4_t v792 = vld1q_f32((const float32_t *)v643); float32x2_t v142 = vmul_f32(v204, v140); float32x4_t v167 = vcombine_f32(v166, v166); float32x2_t v173 = vmul_f32(v204, v171); @@ -3238,20 +3238,20 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu12(const armral_cmplx_f32_t *restrict x, float32x2_t *v752 = &v6[ostride * 3]; float32x2_t *v761 = &v6[ostride * 7]; float32x2_t *v770 = &v6[ostride * 11]; - float32x4_t v778 = *(const float32x4_t *)v580; + float32x4_t v778 = vld1q_f32((const float32_t *)v580); float32x4_t v144 = vcombine_f32(v142, v142); float32x4_t v175 = vcombine_f32(v173, v173); float32x4_t v207 = vcombine_f32(v205, v205); - float32x4_t v774 = *(const float32x4_t *)v561; - float32x4_t v776 = *(const float32x4_t *)v570; - float32x4_t v780 = *(const float32x4_t *)v589; - float32x4_t v782 = *(const float32x4_t *)v598; - float32x4_t v784 = *(const float32x4_t *)v607; - float32x4_t v786 = *(const float32x4_t *)v616; - float32x4_t v788 = *(const float32x4_t *)v625; - float32x4_t v790 = *(const float32x4_t *)v634; - float32x4_t v794 = *(const float32x4_t *)v652; - float32x4_t v796 = *(const float32x4_t *)v661; + float32x4_t v774 = vld1q_f32((const float32_t *)v561); + float32x4_t v776 = vld1q_f32((const float32_t *)v570); + float32x4_t v780 = vld1q_f32((const float32_t *)v589); + float32x4_t v782 = vld1q_f32((const float32_t *)v598); + 
float32x4_t v784 = vld1q_f32((const float32_t *)v607); + float32x4_t v786 = vld1q_f32((const float32_t *)v616); + float32x4_t v788 = vld1q_f32((const float32_t *)v625); + float32x4_t v790 = vld1q_f32((const float32_t *)v634); + float32x4_t v794 = vld1q_f32((const float32_t *)v652); + float32x4_t v796 = vld1q_f32((const float32_t *)v661); float32x4_t v35 = vaddq_f32(v774, v776); float32x4_t v36 = vsubq_f32(v774, v776); float32x4_t v59 = vaddq_f32(v780, v782); @@ -3302,8 +3302,8 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu12(const armral_cmplx_f32_t *restrict x, float32x4_t v215 = vsubq_f32(v208, v213); float32x4_t v216 = vaddq_f32(v121, v158); float32x4_t v264 = vaddq_f32(v122, v163); - *(float32x4_t *)v671 = v121; - *(float32x4_t *)v725 = v122; + vst1q_f32((float32_t *)v671, v121); + vst1q_f32((float32_t *)v725, v122); float32x4_t v146 = vaddq_f32(v118, v145); float32x4_t v147 = vsubq_f32(v118, v145); float32x4_t v217 = vaddq_f32(v216, v192); @@ -3312,20 +3312,20 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu12(const armral_cmplx_f32_t *restrict x, float32x4_t v266 = vsubq_f32(v264, v200); float32x4_t v240 = vaddq_f32(v147, v178); float32x4_t v288 = vaddq_f32(v146, v177); - *(float32x4_t *)v680 = v218; - *(float32x4_t *)v689 = v217; - *(float32x4_t *)v698 = v147; - *(float32x4_t *)v734 = v266; - *(float32x4_t *)v743 = v265; - *(float32x4_t *)v752 = v146; + vst1q_f32((float32_t *)v680, v218); + vst1q_f32((float32_t *)v689, v217); + vst1q_f32((float32_t *)v698, v147); + vst1q_f32((float32_t *)v734, v266); + vst1q_f32((float32_t *)v743, v265); + vst1q_f32((float32_t *)v752, v146); float32x4_t v241 = vaddq_f32(v240, v215); float32x4_t v242 = vsubq_f32(v240, v215); float32x4_t v289 = vaddq_f32(v288, v214); float32x4_t v290 = vsubq_f32(v288, v214); - *(float32x4_t *)v707 = v242; - *(float32x4_t *)v716 = v241; - *(float32x4_t *)v761 = v290; - *(float32x4_t *)v770 = v289; + vst1q_f32((float32_t *)v707, v242); + vst1q_f32((float32_t *)v716, v241); + vst1q_f32((float32_t *)v761, 
v290); + vst1q_f32((float32_t *)v770, v289); v5 += 2 * 1; v6 += 2 * 1; } @@ -3724,7 +3724,7 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu13(const armral_cmplx_f32_t *restrict x, float32x2_t v290 = (float32x2_t){v288, v289}; const float32x2_t *v909 = &v5[0]; float32x2_t *v919 = &v6[0]; - float32x4_t v1031 = *(const float32x4_t *)v800; + float32x4_t v1031 = vld1q_f32((const float32_t *)v800); float32x4_t v163 = vcombine_f32(v162, v162); float32x4_t v168 = vcombine_f32(v167, v167); float32x2_t v174 = vmul_f32(v291, v172); @@ -3767,7 +3767,7 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu13(const armral_cmplx_f32_t *restrict x, float32x2_t *v1000 = &v6[ostride * 4]; float32x2_t *v1009 = &v6[ostride * 3]; float32x2_t *v1018 = &v6[ostride * 2]; - float32x4_t v1055 = *(const float32x4_t *)v909; + float32x4_t v1055 = vld1q_f32((const float32_t *)v909); float32x4_t v176 = vcombine_f32(v174, v174); float32x4_t v184 = vcombine_f32(v182, v182); float32x4_t v192 = vcombine_f32(v190, v190); @@ -3780,17 +3780,17 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu13(const armral_cmplx_f32_t *restrict x, float32x4_t v278 = vcombine_f32(v276, v276); float32x4_t v286 = vcombine_f32(v284, v284); float32x4_t v294 = vcombine_f32(v292, v292); - float32x4_t v1033 = *(const float32x4_t *)v809; - float32x4_t v1035 = *(const float32x4_t *)v818; - float32x4_t v1037 = *(const float32x4_t *)v827; - float32x4_t v1039 = *(const float32x4_t *)v836; - float32x4_t v1041 = *(const float32x4_t *)v845; - float32x4_t v1043 = *(const float32x4_t *)v854; - float32x4_t v1045 = *(const float32x4_t *)v863; - float32x4_t v1047 = *(const float32x4_t *)v872; - float32x4_t v1049 = *(const float32x4_t *)v881; - float32x4_t v1051 = *(const float32x4_t *)v890; - float32x4_t v1053 = *(const float32x4_t *)v899; + float32x4_t v1033 = vld1q_f32((const float32_t *)v809); + float32x4_t v1035 = vld1q_f32((const float32_t *)v818); + float32x4_t v1037 = vld1q_f32((const float32_t *)v827); + float32x4_t v1039 = vld1q_f32((const float32_t 
*)v836); + float32x4_t v1041 = vld1q_f32((const float32_t *)v845); + float32x4_t v1043 = vld1q_f32((const float32_t *)v854); + float32x4_t v1045 = vld1q_f32((const float32_t *)v863); + float32x4_t v1047 = vld1q_f32((const float32_t *)v872); + float32x4_t v1049 = vld1q_f32((const float32_t *)v881); + float32x4_t v1051 = vld1q_f32((const float32_t *)v890); + float32x4_t v1053 = vld1q_f32((const float32_t *)v899); float32x4_t v35 = vaddq_f32(v1031, v1033); float32x4_t v50 = vaddq_f32(v1035, v1037); float32x4_t v65 = vaddq_f32(v1039, v1041); @@ -3877,7 +3877,7 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu13(const armral_cmplx_f32_t *restrict x, float32x4_t v312 = vsubq_f32(v263, v271); float32x4_t v313 = vsubq_f32(v279, v295); float32x4_t v314 = vaddq_f32(v287, v295); - *(float32x4_t *)v919 = v147; + vst1q_f32((float32_t *)v919, v147); float32x4_t v300 = vaddq_f32(v299, v169); float32x4_t v302 = vsubq_f32(v301, v169); float32x4_t v303 = vaddq_f32(v296, v213); @@ -3918,18 +3918,18 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu13(const armral_cmplx_f32_t *restrict x, float32x4_t v344 = vaddq_f32(v319, v328); float32x4_t v345 = vsubq_f32(v318, v326); float32x4_t v346 = vaddq_f32(v317, v324); - *(float32x4_t *)v928 = v335; - *(float32x4_t *)v937 = v336; - *(float32x4_t *)v946 = v337; - *(float32x4_t *)v955 = v338; - *(float32x4_t *)v964 = v339; - *(float32x4_t *)v973 = v340; - *(float32x4_t *)v982 = v341; - *(float32x4_t *)v991 = v342; - *(float32x4_t *)v1000 = v343; - *(float32x4_t *)v1009 = v344; - *(float32x4_t *)v1018 = v345; - *(float32x4_t *)v1027 = v346; + vst1q_f32((float32_t *)v928, v335); + vst1q_f32((float32_t *)v937, v336); + vst1q_f32((float32_t *)v946, v337); + vst1q_f32((float32_t *)v955, v338); + vst1q_f32((float32_t *)v964, v339); + vst1q_f32((float32_t *)v973, v340); + vst1q_f32((float32_t *)v982, v341); + vst1q_f32((float32_t *)v991, v342); + vst1q_f32((float32_t *)v1000, v343); + vst1q_f32((float32_t *)v1009, v344); + vst1q_f32((float32_t *)v1018, v345); + 
vst1q_f32((float32_t *)v1027, v346); v5 += 2 * 1; v6 += 2 * 1; } @@ -4565,7 +4565,7 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu14(const armral_cmplx_f32_t *restrict x, float32x2_t v294 = (float32x2_t){v292, v293}; const float32x2_t *v758 = &v5[0]; float32x2_t *v885 = &v6[0]; - float32x4_t v1024 = *(const float32x4_t *)v839; + float32x4_t v1024 = vld1q_f32((const float32_t *)v839); float32x4_t v251 = vcombine_f32(v250, v250); float32x4_t v256 = vcombine_f32(v255, v255); float32x4_t v261 = vcombine_f32(v260, v260); @@ -4598,23 +4598,23 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu14(const armral_cmplx_f32_t *restrict x, float32x2_t *v984 = &v6[ostride * 5]; float32x2_t *v993 = &v6[ostride * 6]; float32x2_t *v1002 = &v6[ostride * 13]; - float32x4_t v1006 = *(const float32x4_t *)v758; + float32x4_t v1006 = vld1q_f32((const float32_t *)v758); float32x4_t v274 = vcombine_f32(v272, v272); float32x4_t v282 = vcombine_f32(v280, v280); float32x4_t v290 = vcombine_f32(v288, v288); float32x4_t v298 = vcombine_f32(v296, v296); - float32x4_t v1008 = *(const float32x4_t *)v767; - float32x4_t v1010 = *(const float32x4_t *)v776; - float32x4_t v1012 = *(const float32x4_t *)v785; - float32x4_t v1014 = *(const float32x4_t *)v794; - float32x4_t v1016 = *(const float32x4_t *)v803; - float32x4_t v1018 = *(const float32x4_t *)v812; - float32x4_t v1020 = *(const float32x4_t *)v821; - float32x4_t v1022 = *(const float32x4_t *)v830; - float32x4_t v1026 = *(const float32x4_t *)v848; - float32x4_t v1028 = *(const float32x4_t *)v857; - float32x4_t v1030 = *(const float32x4_t *)v866; - float32x4_t v1032 = *(const float32x4_t *)v875; + float32x4_t v1008 = vld1q_f32((const float32_t *)v767); + float32x4_t v1010 = vld1q_f32((const float32_t *)v776); + float32x4_t v1012 = vld1q_f32((const float32_t *)v785); + float32x4_t v1014 = vld1q_f32((const float32_t *)v794); + float32x4_t v1016 = vld1q_f32((const float32_t *)v803); + float32x4_t v1018 = vld1q_f32((const float32_t *)v812); + float32x4_t v1020 = 
vld1q_f32((const float32_t *)v821); + float32x4_t v1022 = vld1q_f32((const float32_t *)v830); + float32x4_t v1026 = vld1q_f32((const float32_t *)v848); + float32x4_t v1028 = vld1q_f32((const float32_t *)v857); + float32x4_t v1030 = vld1q_f32((const float32_t *)v866); + float32x4_t v1032 = vld1q_f32((const float32_t *)v875); float32x4_t v35 = vaddq_f32(v1006, v1008); float32x4_t v36 = vsubq_f32(v1006, v1008); float32x4_t v51 = vaddq_f32(v1010, v1012); @@ -4689,8 +4689,8 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu14(const armral_cmplx_f32_t *restrict x, float32x4_t v207 = vaddq_f32(v141, v159); float32x4_t v275 = vmulq_f32(v273, v274); float32x4_t v300 = vaddq_f32(v234, v252); - *(float32x4_t *)v885 = v141; - *(float32x4_t *)v894 = v234; + vst1q_f32((float32_t *)v885, v141); + vst1q_f32((float32_t *)v894, v234); float32x4_t v208 = vaddq_f32(v207, v164); float32x4_t v210 = vsubq_f32(v207, v164); float32x4_t v212 = vsubq_f32(v207, v169); @@ -4727,18 +4727,18 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu14(const armral_cmplx_f32_t *restrict x, float32x4_t v316 = vsubq_f32(v304, v310); float32x4_t v317 = vaddq_f32(v306, v312); float32x4_t v318 = vsubq_f32(v306, v312); - *(float32x4_t *)v903 = v221; - *(float32x4_t *)v912 = v314; - *(float32x4_t *)v921 = v223; - *(float32x4_t *)v930 = v316; - *(float32x4_t *)v939 = v224; - *(float32x4_t *)v948 = v317; - *(float32x4_t *)v957 = v225; - *(float32x4_t *)v966 = v318; - *(float32x4_t *)v975 = v222; - *(float32x4_t *)v984 = v315; - *(float32x4_t *)v993 = v220; - *(float32x4_t *)v1002 = v313; + vst1q_f32((float32_t *)v903, v221); + vst1q_f32((float32_t *)v912, v314); + vst1q_f32((float32_t *)v921, v223); + vst1q_f32((float32_t *)v930, v316); + vst1q_f32((float32_t *)v939, v224); + vst1q_f32((float32_t *)v948, v317); + vst1q_f32((float32_t *)v957, v225); + vst1q_f32((float32_t *)v966, v318); + vst1q_f32((float32_t *)v975, v222); + vst1q_f32((float32_t *)v984, v315); + vst1q_f32((float32_t *)v993, v220); + vst1q_f32((float32_t *)v1002, 
v313); v5 += 2 * 1; v6 += 2 * 1; } @@ -5289,7 +5289,7 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu15(const armral_cmplx_f32_t *restrict x, float32x2_t v297 = (float32x2_t){v296, v296}; const float32x2_t *v796 = &v5[0]; float32x2_t *v914 = &v6[0]; - float32x4_t v1058 = *(const float32x4_t *)v841; + float32x4_t v1058 = vld1q_f32((const float32_t *)v841); float32x4_t v157 = vcombine_f32(v156, v156); float32x4_t v162 = vcombine_f32(v161, v161); float32x2_t v168 = vmul_f32(v280, v166); @@ -5333,7 +5333,7 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu15(const armral_cmplx_f32_t *restrict x, float32x2_t *v1022 = &v6[ostride * 9]; float32x2_t *v1031 = &v6[ostride * 4]; float32x2_t *v1040 = &v6[ostride * 14]; - float32x4_t v1048 = *(const float32x4_t *)v796; + float32x4_t v1048 = vld1q_f32((const float32_t *)v796); float32x4_t v170 = vcombine_f32(v168, v168); float32x4_t v178 = vcombine_f32(v176, v176); float32x4_t v186 = vcombine_f32(v184, v184); @@ -5343,19 +5343,19 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu15(const armral_cmplx_f32_t *restrict x, float32x4_t v267 = vcombine_f32(v265, v265); float32x4_t v275 = vcombine_f32(v273, v273); float32x4_t v283 = vcombine_f32(v281, v281); - float32x4_t v1044 = *(const float32x4_t *)v777; - float32x4_t v1046 = *(const float32x4_t *)v786; - float32x4_t v1050 = *(const float32x4_t *)v805; - float32x4_t v1052 = *(const float32x4_t *)v814; - float32x4_t v1054 = *(const float32x4_t *)v823; - float32x4_t v1056 = *(const float32x4_t *)v832; - float32x4_t v1060 = *(const float32x4_t *)v850; - float32x4_t v1062 = *(const float32x4_t *)v859; - float32x4_t v1064 = *(const float32x4_t *)v868; - float32x4_t v1066 = *(const float32x4_t *)v877; - float32x4_t v1068 = *(const float32x4_t *)v886; - float32x4_t v1070 = *(const float32x4_t *)v895; - float32x4_t v1072 = *(const float32x4_t *)v904; + float32x4_t v1044 = vld1q_f32((const float32_t *)v777); + float32x4_t v1046 = vld1q_f32((const float32_t *)v786); + float32x4_t v1050 = vld1q_f32((const float32_t 
*)v805); + float32x4_t v1052 = vld1q_f32((const float32_t *)v814); + float32x4_t v1054 = vld1q_f32((const float32_t *)v823); + float32x4_t v1056 = vld1q_f32((const float32_t *)v832); + float32x4_t v1060 = vld1q_f32((const float32_t *)v850); + float32x4_t v1062 = vld1q_f32((const float32_t *)v859); + float32x4_t v1064 = vld1q_f32((const float32_t *)v868); + float32x4_t v1066 = vld1q_f32((const float32_t *)v877); + float32x4_t v1068 = vld1q_f32((const float32_t *)v886); + float32x4_t v1070 = vld1q_f32((const float32_t *)v895); + float32x4_t v1072 = vld1q_f32((const float32_t *)v904); float32x4_t v35 = vaddq_f32(v1044, v1046); float32x4_t v36 = vsubq_f32(v1044, v1046); float32x4_t v59 = vaddq_f32(v1050, v1052); @@ -5428,7 +5428,7 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu15(const armral_cmplx_f32_t *restrict x, float32x4_t v248 = vaddq_f32(v235, v243); float32x4_t v268 = vmulq_f32(v266, v267); float32x4_t v309 = vaddq_f32(v148, v209); - *(float32x4_t *)v914 = v148; + vst1q_f32((float32_t *)v914, v148); float32x4_t v189 = vaddq_f32(v188, v163); float32x4_t v190 = vsubq_f32(v188, v163); float32x4_t v191 = vsubq_f32(v171, v179); @@ -5448,8 +5448,8 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu15(const armral_cmplx_f32_t *restrict x, float32x4_t v252 = vsubq_f32(v246, v248); float32x4_t v301 = vaddq_f32(v300, v284); float32x4_t v302 = vsubq_f32(v300, v284); - *(float32x4_t *)v923 = v311; - *(float32x4_t *)v932 = v310; + vst1q_f32((float32_t *)v923, v311); + vst1q_f32((float32_t *)v932, v310); float32x4_t v305 = vaddq_f32(v301, v303); float32x4_t v306 = vsubq_f32(v301, v303); float32x4_t v307 = vaddq_f32(v302, v304); @@ -5458,10 +5458,10 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu15(const armral_cmplx_f32_t *restrict x, float32x4_t v357 = vaddq_f32(v196, v252); float32x4_t v381 = vaddq_f32(v195, v251); float32x4_t v405 = vaddq_f32(v193, v249); - *(float32x4_t *)v941 = v194; - *(float32x4_t *)v968 = v196; - *(float32x4_t *)v995 = v195; - *(float32x4_t *)v1022 = v193; + 
vst1q_f32((float32_t *)v941, v194); + vst1q_f32((float32_t *)v968, v196); + vst1q_f32((float32_t *)v995, v195); + vst1q_f32((float32_t *)v1022, v193); float32x4_t v334 = vaddq_f32(v333, v306); float32x4_t v335 = vsubq_f32(v333, v306); float32x4_t v358 = vaddq_f32(v357, v308); @@ -5470,14 +5470,14 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu15(const armral_cmplx_f32_t *restrict x, float32x4_t v383 = vsubq_f32(v381, v307); float32x4_t v406 = vaddq_f32(v405, v305); float32x4_t v407 = vsubq_f32(v405, v305); - *(float32x4_t *)v950 = v335; - *(float32x4_t *)v959 = v334; - *(float32x4_t *)v977 = v359; - *(float32x4_t *)v986 = v358; - *(float32x4_t *)v1004 = v383; - *(float32x4_t *)v1013 = v382; - *(float32x4_t *)v1031 = v407; - *(float32x4_t *)v1040 = v406; + vst1q_f32((float32_t *)v950, v335); + vst1q_f32((float32_t *)v959, v334); + vst1q_f32((float32_t *)v977, v359); + vst1q_f32((float32_t *)v986, v358); + vst1q_f32((float32_t *)v1004, v383); + vst1q_f32((float32_t *)v1013, v382); + vst1q_f32((float32_t *)v1031, v407); + vst1q_f32((float32_t *)v1040, v406); v5 += 2 * 1; v6 += 2 * 1; } @@ -6054,7 +6054,7 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu16(const armral_cmplx_f32_t *restrict x, float32x2_t v284 = (float32x2_t){v283, v283}; const float32x2_t *v802 = &v5[0]; float32x2_t *v947 = &v6[0]; - float32x4_t v1102 = *(const float32x4_t *)v874; + float32x4_t v1102 = vld1q_f32((const float32_t *)v874); float32x2_t v231 = vmul_f32(v267, v229); float32x2_t v239 = vmul_f32(v267, v237); float32x4_t v246 = vcombine_f32(v245, v245); @@ -6092,26 +6092,26 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu16(const armral_cmplx_f32_t *restrict x, float32x2_t *v1064 = &v6[ostride * 13]; float32x2_t *v1073 = &v6[ostride * 14]; float32x2_t *v1082 = &v6[ostride * 15]; - float32x4_t v1086 = *(const float32x4_t *)v802; + float32x4_t v1086 = vld1q_f32((const float32_t *)v802); float32x4_t v233 = vcombine_f32(v231, v231); float32x4_t v241 = vcombine_f32(v239, v239); float32x4_t v254 = vcombine_f32(v252, 
v252); float32x4_t v262 = vcombine_f32(v260, v260); float32x4_t v270 = vcombine_f32(v268, v268); - float32x4_t v1088 = *(const float32x4_t *)v811; - float32x4_t v1090 = *(const float32x4_t *)v820; - float32x4_t v1092 = *(const float32x4_t *)v829; - float32x4_t v1094 = *(const float32x4_t *)v838; - float32x4_t v1096 = *(const float32x4_t *)v847; - float32x4_t v1098 = *(const float32x4_t *)v856; - float32x4_t v1100 = *(const float32x4_t *)v865; - float32x4_t v1104 = *(const float32x4_t *)v883; - float32x4_t v1106 = *(const float32x4_t *)v892; - float32x4_t v1108 = *(const float32x4_t *)v901; - float32x4_t v1110 = *(const float32x4_t *)v910; - float32x4_t v1112 = *(const float32x4_t *)v919; - float32x4_t v1114 = *(const float32x4_t *)v928; - float32x4_t v1116 = *(const float32x4_t *)v937; + float32x4_t v1088 = vld1q_f32((const float32_t *)v811); + float32x4_t v1090 = vld1q_f32((const float32_t *)v820); + float32x4_t v1092 = vld1q_f32((const float32_t *)v829); + float32x4_t v1094 = vld1q_f32((const float32_t *)v838); + float32x4_t v1096 = vld1q_f32((const float32_t *)v847); + float32x4_t v1098 = vld1q_f32((const float32_t *)v856); + float32x4_t v1100 = vld1q_f32((const float32_t *)v865); + float32x4_t v1104 = vld1q_f32((const float32_t *)v883); + float32x4_t v1106 = vld1q_f32((const float32_t *)v892); + float32x4_t v1108 = vld1q_f32((const float32_t *)v901); + float32x4_t v1110 = vld1q_f32((const float32_t *)v910); + float32x4_t v1112 = vld1q_f32((const float32_t *)v919); + float32x4_t v1114 = vld1q_f32((const float32_t *)v928); + float32x4_t v1116 = vld1q_f32((const float32_t *)v937); float32x4_t v35 = vaddq_f32(v1086, v1088); float32x4_t v36 = vsubq_f32(v1086, v1088); float32x4_t v51 = vaddq_f32(v1090, v1092); @@ -6183,8 +6183,8 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu16(const armral_cmplx_f32_t *restrict x, float32x4_t v304 = vsubq_f32(v286, v276); float32x4_t v305 = vsubq_f32(v276, v281); float32x4_t v306 = vsubq_f32(v276, v286); - *(float32x4_t *)v947 = v161; - 
*(float32x4_t *)v1019 = v162; + vst1q_f32((float32_t *)v947, v161); + vst1q_f32((float32_t *)v1019, v162); float32x4_t v287 = vaddq_f32(v158, v195); float32x4_t v288 = vsubq_f32(v158, v195); float32x4_t v290 = vaddq_f32(v208, v216); @@ -6209,8 +6209,8 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu16(const armral_cmplx_f32_t *restrict x, float32x4_t v320 = vsubq_f32(v302, v304); float32x4_t v321 = vaddq_f32(v302, v300); float32x4_t v322 = vsubq_f32(v302, v300); - *(float32x4_t *)v983 = v288; - *(float32x4_t *)v1055 = v287; + vst1q_f32((float32_t *)v983, v288); + vst1q_f32((float32_t *)v1055, v287); float32x4_t v323 = vaddq_f32(v307, v317); float32x4_t v324 = vaddq_f32(v308, v318); float32x4_t v325 = vsubq_f32(v309, v318); @@ -6219,18 +6219,18 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu16(const armral_cmplx_f32_t *restrict x, float32x4_t v328 = vaddq_f32(v312, v320); float32x4_t v329 = vsubq_f32(v313, v322); float32x4_t v330 = vsubq_f32(v314, v321); - *(float32x4_t *)v965 = v296; - *(float32x4_t *)v1001 = v295; - *(float32x4_t *)v1037 = v294; - *(float32x4_t *)v1073 = v293; - *(float32x4_t *)v956 = v326; - *(float32x4_t *)v974 = v329; - *(float32x4_t *)v992 = v330; - *(float32x4_t *)v1010 = v325; - *(float32x4_t *)v1028 = v324; - *(float32x4_t *)v1046 = v327; - *(float32x4_t *)v1064 = v328; - *(float32x4_t *)v1082 = v323; + vst1q_f32((float32_t *)v965, v296); + vst1q_f32((float32_t *)v1001, v295); + vst1q_f32((float32_t *)v1037, v294); + vst1q_f32((float32_t *)v1073, v293); + vst1q_f32((float32_t *)v956, v326); + vst1q_f32((float32_t *)v974, v329); + vst1q_f32((float32_t *)v992, v330); + vst1q_f32((float32_t *)v1010, v325); + vst1q_f32((float32_t *)v1028, v324); + vst1q_f32((float32_t *)v1046, v327); + vst1q_f32((float32_t *)v1064, v328); + vst1q_f32((float32_t *)v1082, v323); v5 += 2 * 1; v6 += 2 * 1; } @@ -6844,7 +6844,7 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu17(const armral_cmplx_f32_t *restrict x, float32x2_t v444 = (float32x2_t){v442, v443}; const float32x2_t *v1356 
= &v5[0]; float32x2_t *v1366 = &v6[0]; - float32x4_t v1514 = *(const float32x4_t *)v1211; + float32x4_t v1514 = vld1q_f32((const float32_t *)v1211); float32x4_t v215 = vcombine_f32(v214, v214); float32x4_t v220 = vcombine_f32(v219, v219); float32x4_t v225 = vcombine_f32(v224, v224); @@ -6910,7 +6910,7 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu17(const armral_cmplx_f32_t *restrict x, float32x2_t *v1492 = &v6[ostride * 10]; float32x2_t *v1501 = &v6[ostride * 8]; float32x2_t *v1510 = &v6[ostride * 9]; - float32x4_t v1546 = *(const float32x4_t *)v1356; + float32x4_t v1546 = vld1q_f32((const float32_t *)v1356); float32x4_t v288 = vcombine_f32(v286, v286); float32x4_t v296 = vcombine_f32(v294, v294); float32x4_t v304 = vcombine_f32(v302, v302); @@ -6932,21 +6932,21 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu17(const armral_cmplx_f32_t *restrict x, float32x4_t v432 = vcombine_f32(v430, v430); float32x4_t v440 = vcombine_f32(v438, v438); float32x4_t v448 = vcombine_f32(v446, v446); - float32x4_t v1516 = *(const float32x4_t *)v1220; - float32x4_t v1518 = *(const float32x4_t *)v1229; - float32x4_t v1520 = *(const float32x4_t *)v1238; - float32x4_t v1522 = *(const float32x4_t *)v1247; - float32x4_t v1524 = *(const float32x4_t *)v1256; - float32x4_t v1526 = *(const float32x4_t *)v1265; - float32x4_t v1528 = *(const float32x4_t *)v1274; - float32x4_t v1530 = *(const float32x4_t *)v1283; - float32x4_t v1532 = *(const float32x4_t *)v1292; - float32x4_t v1534 = *(const float32x4_t *)v1301; - float32x4_t v1536 = *(const float32x4_t *)v1310; - float32x4_t v1538 = *(const float32x4_t *)v1319; - float32x4_t v1540 = *(const float32x4_t *)v1328; - float32x4_t v1542 = *(const float32x4_t *)v1337; - float32x4_t v1544 = *(const float32x4_t *)v1346; + float32x4_t v1516 = vld1q_f32((const float32_t *)v1220); + float32x4_t v1518 = vld1q_f32((const float32_t *)v1229); + float32x4_t v1520 = vld1q_f32((const float32_t *)v1238); + float32x4_t v1522 = vld1q_f32((const float32_t *)v1247); + 
float32x4_t v1524 = vld1q_f32((const float32_t *)v1256); + float32x4_t v1526 = vld1q_f32((const float32_t *)v1265); + float32x4_t v1528 = vld1q_f32((const float32_t *)v1274); + float32x4_t v1530 = vld1q_f32((const float32_t *)v1283); + float32x4_t v1532 = vld1q_f32((const float32_t *)v1292); + float32x4_t v1534 = vld1q_f32((const float32_t *)v1301); + float32x4_t v1536 = vld1q_f32((const float32_t *)v1310); + float32x4_t v1538 = vld1q_f32((const float32_t *)v1319); + float32x4_t v1540 = vld1q_f32((const float32_t *)v1328); + float32x4_t v1542 = vld1q_f32((const float32_t *)v1337); + float32x4_t v1544 = vld1q_f32((const float32_t *)v1346); float32x4_t v35 = vaddq_f32(v1514, v1516); float32x4_t v36 = vsubq_f32(v1514, v1516); float32x4_t v51 = vaddq_f32(v1518, v1520); @@ -7071,7 +7071,7 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu17(const armral_cmplx_f32_t *restrict x, float32x4_t v456 = vsubq_f32(v281, v241); float32x4_t v457 = vaddq_f32(v281, v236); float32x4_t v458 = vaddq_f32(v246, v206); - *(float32x4_t *)v1366 = v206; + vst1q_f32((float32_t *)v1366, v206); float32x4_t v197 = vsubq_f32(v196, v148); float32x4_t v431 = vrev64q_f32(v193); float32x4_t v459 = vaddq_f32(v251, v458); @@ -7140,8 +7140,8 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu17(const armral_cmplx_f32_t *restrict x, float32x4_t v504 = vaddq_f32(v496, v502); float32x4_t v510 = vaddq_f32(v509, v502); float32x4_t v521 = vaddq_f32(v520, v502); - *(float32x4_t *)v1411 = v563; - *(float32x4_t *)v1420 = v571; + vst1q_f32((float32_t *)v1411, v563); + vst1q_f32((float32_t *)v1420, v571); float32x4_t v506 = vaddq_f32(v505, v497); float32x4_t v508 = vaddq_f32(v507, v500); float32x4_t v512 = vsubq_f32(v511, v504); @@ -7159,24 +7159,24 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu17(const armral_cmplx_f32_t *restrict x, float32x4_t v603 = vsubq_f32(v472, v512); float32x4_t v611 = vaddq_f32(v470, v508); float32x4_t v619 = vsubq_f32(v470, v508); - *(float32x4_t *)v1393 = v547; - *(float32x4_t *)v1402 = v555; - *(float32x4_t 
*)v1501 = v643; - *(float32x4_t *)v1510 = v651; + vst1q_f32((float32_t *)v1393, v547); + vst1q_f32((float32_t *)v1402, v555); + vst1q_f32((float32_t *)v1501, v643); + vst1q_f32((float32_t *)v1510, v651); float32x4_t v579 = vaddq_f32(v473, v515); float32x4_t v587 = vsubq_f32(v473, v515); float32x4_t v627 = vaddq_f32(v474, v518); float32x4_t v635 = vsubq_f32(v474, v518); - *(float32x4_t *)v1375 = v531; - *(float32x4_t *)v1384 = v539; - *(float32x4_t *)v1447 = v595; - *(float32x4_t *)v1456 = v603; - *(float32x4_t *)v1465 = v611; - *(float32x4_t *)v1474 = v619; - *(float32x4_t *)v1429 = v579; - *(float32x4_t *)v1438 = v587; - *(float32x4_t *)v1483 = v627; - *(float32x4_t *)v1492 = v635; + vst1q_f32((float32_t *)v1375, v531); + vst1q_f32((float32_t *)v1384, v539); + vst1q_f32((float32_t *)v1447, v595); + vst1q_f32((float32_t *)v1456, v603); + vst1q_f32((float32_t *)v1465, v611); + vst1q_f32((float32_t *)v1474, v619); + vst1q_f32((float32_t *)v1429, v579); + vst1q_f32((float32_t *)v1438, v587); + vst1q_f32((float32_t *)v1483, v627); + vst1q_f32((float32_t *)v1492, v635); v5 += 2 * 1; v6 += 2 * 1; } @@ -8135,7 +8135,7 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu18(const armral_cmplx_f32_t *restrict x, float32x2_t v364 = (float32x2_t){v362, v363}; const float32x2_t *v946 = &v5[0]; float32x2_t *v1109 = &v6[0]; - float32x4_t v1288 = *(const float32x4_t *)v1045; + float32x4_t v1288 = vld1q_f32((const float32_t *)v1045); float32x4_t v308 = vcombine_f32(v307, v307); float32x4_t v321 = vcombine_f32(v320, v320); float32x2_t v327 = vmul_f32(v365, v325); @@ -8177,27 +8177,27 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu18(const armral_cmplx_f32_t *restrict x, float32x2_t *v1244 = &v6[ostride * 7]; float32x2_t *v1253 = &v6[ostride * 8]; float32x2_t *v1262 = &v6[ostride * 17]; - float32x4_t v1266 = *(const float32x4_t *)v946; + float32x4_t v1266 = vld1q_f32((const float32_t *)v946); float32x4_t v329 = vcombine_f32(v327, v327); float32x4_t v352 = vcombine_f32(v350, v350); float32x4_t v360 = 
vcombine_f32(v358, v358); float32x4_t v368 = vcombine_f32(v366, v366); - float32x4_t v1268 = *(const float32x4_t *)v955; - float32x4_t v1270 = *(const float32x4_t *)v964; - float32x4_t v1272 = *(const float32x4_t *)v973; - float32x4_t v1274 = *(const float32x4_t *)v982; - float32x4_t v1276 = *(const float32x4_t *)v991; - float32x4_t v1278 = *(const float32x4_t *)v1000; - float32x4_t v1280 = *(const float32x4_t *)v1009; - float32x4_t v1282 = *(const float32x4_t *)v1018; - float32x4_t v1284 = *(const float32x4_t *)v1027; - float32x4_t v1286 = *(const float32x4_t *)v1036; - float32x4_t v1290 = *(const float32x4_t *)v1054; - float32x4_t v1292 = *(const float32x4_t *)v1063; - float32x4_t v1294 = *(const float32x4_t *)v1072; - float32x4_t v1296 = *(const float32x4_t *)v1081; - float32x4_t v1298 = *(const float32x4_t *)v1090; - float32x4_t v1300 = *(const float32x4_t *)v1099; + float32x4_t v1268 = vld1q_f32((const float32_t *)v955); + float32x4_t v1270 = vld1q_f32((const float32_t *)v964); + float32x4_t v1272 = vld1q_f32((const float32_t *)v973); + float32x4_t v1274 = vld1q_f32((const float32_t *)v982); + float32x4_t v1276 = vld1q_f32((const float32_t *)v991); + float32x4_t v1278 = vld1q_f32((const float32_t *)v1000); + float32x4_t v1280 = vld1q_f32((const float32_t *)v1009); + float32x4_t v1282 = vld1q_f32((const float32_t *)v1018); + float32x4_t v1284 = vld1q_f32((const float32_t *)v1027); + float32x4_t v1286 = vld1q_f32((const float32_t *)v1036); + float32x4_t v1290 = vld1q_f32((const float32_t *)v1054); + float32x4_t v1292 = vld1q_f32((const float32_t *)v1063); + float32x4_t v1294 = vld1q_f32((const float32_t *)v1072); + float32x4_t v1296 = vld1q_f32((const float32_t *)v1081); + float32x4_t v1298 = vld1q_f32((const float32_t *)v1090); + float32x4_t v1300 = vld1q_f32((const float32_t *)v1099); float32x4_t v35 = vaddq_f32(v1266, v1268); float32x4_t v36 = vsubq_f32(v1266, v1268); float32x4_t v51 = vaddq_f32(v1270, v1272); @@ -8304,8 +8304,8 @@ void 
armral_fft_cf32_cf32_cf32_ac_n_uu18(const armral_cmplx_f32_t *restrict x, float32x4_t v384 = vaddq_f32(v383, v361); float32x4_t v386 = vaddq_f32(v385, v369); float32x4_t v388 = vsubq_f32(v387, v369); - *(float32x4_t *)v1109 = v176; - *(float32x4_t *)v1118 = v291; + vst1q_f32((float32_t *)v1109, v176); + vst1q_f32((float32_t *)v1118, v291); float32x4_t v257 = vaddq_f32(v176, v256); float32x4_t v261 = vaddq_f32(v260, v255); float32x4_t v372 = vaddq_f32(v291, v371); @@ -8326,10 +8326,10 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu18(const armral_cmplx_f32_t *restrict x, float32x4_t v378 = vaddq_f32(v377, v340); float32x4_t v380 = vaddq_f32(v379, v345); float32x4_t v382 = vsubq_f32(v381, v345); - *(float32x4_t *)v1163 = v259; - *(float32x4_t *)v1172 = v374; - *(float32x4_t *)v1217 = v258; - *(float32x4_t *)v1226 = v373; + vst1q_f32((float32_t *)v1163, v259); + vst1q_f32((float32_t *)v1172, v374); + vst1q_f32((float32_t *)v1217, v258); + vst1q_f32((float32_t *)v1226, v373); float32x4_t v274 = vaddq_f32(v263, v269); float32x4_t v275 = vsubq_f32(v263, v269); float32x4_t v276 = vaddq_f32(v265, v271); @@ -8342,18 +8342,18 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu18(const armral_cmplx_f32_t *restrict x, float32x4_t v392 = vsubq_f32(v380, v386); float32x4_t v393 = vaddq_f32(v382, v388); float32x4_t v394 = vsubq_f32(v382, v388); - *(float32x4_t *)v1127 = v275; - *(float32x4_t *)v1136 = v390; - *(float32x4_t *)v1145 = v276; - *(float32x4_t *)v1154 = v391; - *(float32x4_t *)v1181 = v279; - *(float32x4_t *)v1190 = v394; - *(float32x4_t *)v1199 = v278; - *(float32x4_t *)v1208 = v393; - *(float32x4_t *)v1235 = v277; - *(float32x4_t *)v1244 = v392; - *(float32x4_t *)v1253 = v274; - *(float32x4_t *)v1262 = v389; + vst1q_f32((float32_t *)v1127, v275); + vst1q_f32((float32_t *)v1136, v390); + vst1q_f32((float32_t *)v1145, v276); + vst1q_f32((float32_t *)v1154, v391); + vst1q_f32((float32_t *)v1181, v279); + vst1q_f32((float32_t *)v1190, v394); + vst1q_f32((float32_t *)v1199, v278); + 
vst1q_f32((float32_t *)v1208, v393); + vst1q_f32((float32_t *)v1235, v277); + vst1q_f32((float32_t *)v1244, v392); + vst1q_f32((float32_t *)v1253, v274); + vst1q_f32((float32_t *)v1262, v389); v5 += 2 * 1; v6 += 2 * 1; } @@ -9076,7 +9076,7 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu19(const armral_cmplx_f32_t *restrict x, float32x2_t v483 = (float32x2_t){v481, v482}; const float32x2_t *v1495 = &v5[0]; float32x2_t *v1505 = &v6[0]; - float32x4_t v1671 = *(const float32x4_t *)v1332; + float32x4_t v1671 = vld1q_f32((const float32_t *)v1332); float32x4_t v245 = vcombine_f32(v244, v244); float32x4_t v250 = vcombine_f32(v249, v249); float32x4_t v255 = vcombine_f32(v254, v254); @@ -9149,7 +9149,7 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu19(const armral_cmplx_f32_t *restrict x, float32x2_t *v1649 = &v6[ostride * 11]; float32x2_t *v1658 = &v6[ostride * 9]; float32x2_t *v1667 = &v6[ostride * 10]; - float32x4_t v1707 = *(const float32x4_t *)v1495; + float32x4_t v1707 = vld1q_f32((const float32_t *)v1495); float32x4_t v343 = vcombine_f32(v341, v341); float32x4_t v351 = vcombine_f32(v349, v349); float32x4_t v359 = vcombine_f32(v357, v357); @@ -9169,23 +9169,23 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu19(const armral_cmplx_f32_t *restrict x, float32x4_t v471 = vcombine_f32(v469, v469); float32x4_t v479 = vcombine_f32(v477, v477); float32x4_t v487 = vcombine_f32(v485, v485); - float32x4_t v1673 = *(const float32x4_t *)v1341; - float32x4_t v1675 = *(const float32x4_t *)v1350; - float32x4_t v1677 = *(const float32x4_t *)v1359; - float32x4_t v1679 = *(const float32x4_t *)v1368; - float32x4_t v1681 = *(const float32x4_t *)v1377; - float32x4_t v1683 = *(const float32x4_t *)v1386; - float32x4_t v1685 = *(const float32x4_t *)v1395; - float32x4_t v1687 = *(const float32x4_t *)v1404; - float32x4_t v1689 = *(const float32x4_t *)v1413; - float32x4_t v1691 = *(const float32x4_t *)v1422; - float32x4_t v1693 = *(const float32x4_t *)v1431; - float32x4_t v1695 = *(const float32x4_t *)v1440; - 
float32x4_t v1697 = *(const float32x4_t *)v1449; - float32x4_t v1699 = *(const float32x4_t *)v1458; - float32x4_t v1701 = *(const float32x4_t *)v1467; - float32x4_t v1703 = *(const float32x4_t *)v1476; - float32x4_t v1705 = *(const float32x4_t *)v1485; + float32x4_t v1673 = vld1q_f32((const float32_t *)v1341); + float32x4_t v1675 = vld1q_f32((const float32_t *)v1350); + float32x4_t v1677 = vld1q_f32((const float32_t *)v1359); + float32x4_t v1679 = vld1q_f32((const float32_t *)v1368); + float32x4_t v1681 = vld1q_f32((const float32_t *)v1377); + float32x4_t v1683 = vld1q_f32((const float32_t *)v1386); + float32x4_t v1685 = vld1q_f32((const float32_t *)v1395); + float32x4_t v1687 = vld1q_f32((const float32_t *)v1404); + float32x4_t v1689 = vld1q_f32((const float32_t *)v1413); + float32x4_t v1691 = vld1q_f32((const float32_t *)v1422); + float32x4_t v1693 = vld1q_f32((const float32_t *)v1431); + float32x4_t v1695 = vld1q_f32((const float32_t *)v1440); + float32x4_t v1697 = vld1q_f32((const float32_t *)v1449); + float32x4_t v1699 = vld1q_f32((const float32_t *)v1458); + float32x4_t v1701 = vld1q_f32((const float32_t *)v1467); + float32x4_t v1703 = vld1q_f32((const float32_t *)v1476); + float32x4_t v1705 = vld1q_f32((const float32_t *)v1485); float32x4_t v35 = vaddq_f32(v1671, v1673); float32x4_t v36 = vsubq_f32(v1671, v1673); float32x4_t v51 = vaddq_f32(v1675, v1677); @@ -9334,7 +9334,7 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu19(const armral_cmplx_f32_t *restrict x, float32x4_t v515 = vaddq_f32(v246, v188); float32x4_t v520 = vaddq_f32(v352, v360); float32x4_t v521 = vaddq_f32(v376, v384); - *(float32x4_t *)v1505 = v188; + vst1q_f32((float32_t *)v1505, v188); float32x4_t v464 = vmulq_f32(v462, v463); float32x4_t v491 = vaddq_f32(v316, v321); float32x4_t v495 = vaddq_f32(v311, v321); @@ -9418,34 +9418,34 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu19(const armral_cmplx_f32_t *restrict x, float32x4_t v653 = vsubq_f32(v555, v567); float32x4_t v693 = vsubq_f32(v559, v571); 
float32x4_t v701 = vaddq_f32(v559, v571); - *(float32x4_t *)v1532 = v597; - *(float32x4_t *)v1541 = v605; - *(float32x4_t *)v1550 = v613; - *(float32x4_t *)v1559 = v621; + vst1q_f32((float32_t *)v1532, v597); + vst1q_f32((float32_t *)v1541, v605); + vst1q_f32((float32_t *)v1550, v613); + vst1q_f32((float32_t *)v1559, v621); float32x4_t v563 = vaddq_f32(v562, v546); float32x4_t v565 = vaddq_f32(v564, v548); float32x4_t v661 = vaddq_f32(v557, v569); float32x4_t v669 = vsubq_f32(v557, v569); float32x4_t v677 = vaddq_f32(v556, v568); float32x4_t v685 = vsubq_f32(v556, v568); - *(float32x4_t *)v1568 = v629; - *(float32x4_t *)v1577 = v637; - *(float32x4_t *)v1586 = v645; - *(float32x4_t *)v1595 = v653; - *(float32x4_t *)v1640 = v693; - *(float32x4_t *)v1649 = v701; + vst1q_f32((float32_t *)v1568, v629); + vst1q_f32((float32_t *)v1577, v637); + vst1q_f32((float32_t *)v1586, v645); + vst1q_f32((float32_t *)v1595, v653); + vst1q_f32((float32_t *)v1640, v693); + vst1q_f32((float32_t *)v1649, v701); float32x4_t v581 = vaddq_f32(v551, v563); float32x4_t v589 = vsubq_f32(v551, v563); float32x4_t v709 = vaddq_f32(v553, v565); float32x4_t v717 = vsubq_f32(v553, v565); - *(float32x4_t *)v1604 = v661; - *(float32x4_t *)v1613 = v669; - *(float32x4_t *)v1622 = v677; - *(float32x4_t *)v1631 = v685; - *(float32x4_t *)v1514 = v581; - *(float32x4_t *)v1523 = v589; - *(float32x4_t *)v1658 = v709; - *(float32x4_t *)v1667 = v717; + vst1q_f32((float32_t *)v1604, v661); + vst1q_f32((float32_t *)v1613, v669); + vst1q_f32((float32_t *)v1622, v677); + vst1q_f32((float32_t *)v1631, v685); + vst1q_f32((float32_t *)v1514, v581); + vst1q_f32((float32_t *)v1523, v589); + vst1q_f32((float32_t *)v1658, v709); + vst1q_f32((float32_t *)v1667, v717); v5 += 2 * 1; v6 += 2 * 1; } @@ -10521,7 +10521,7 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu20(const armral_cmplx_f32_t *restrict x, float32x2_t v403 = (float32x2_t){v402, v402}; const float32x2_t *v1024 = &v5[0]; float32x2_t *v1205 = &v6[0]; - float32x4_t 
v1416 = *(const float32x4_t *)v1186; + float32x4_t v1416 = vld1q_f32((const float32_t *)v1186); float32x4_t v319 = vcombine_f32(v318, v318); float32x4_t v324 = vcombine_f32(v323, v323); float32x2_t v330 = vmul_f32(v386, v328); @@ -10569,31 +10569,31 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu20(const armral_cmplx_f32_t *restrict x, float32x2_t *v1358 = &v6[ostride * 9]; float32x2_t *v1367 = &v6[ostride * 14]; float32x2_t *v1376 = &v6[ostride * 19]; - float32x4_t v1380 = *(const float32x4_t *)v1024; + float32x4_t v1380 = vld1q_f32((const float32_t *)v1024); float32x4_t v332 = vcombine_f32(v330, v330); float32x4_t v340 = vcombine_f32(v338, v338); float32x4_t v348 = vcombine_f32(v346, v346); float32x4_t v373 = vcombine_f32(v371, v371); float32x4_t v381 = vcombine_f32(v379, v379); float32x4_t v389 = vcombine_f32(v387, v387); - float32x4_t v1382 = *(const float32x4_t *)v1033; - float32x4_t v1384 = *(const float32x4_t *)v1042; - float32x4_t v1386 = *(const float32x4_t *)v1051; - float32x4_t v1388 = *(const float32x4_t *)v1060; - float32x4_t v1390 = *(const float32x4_t *)v1069; - float32x4_t v1392 = *(const float32x4_t *)v1078; - float32x4_t v1394 = *(const float32x4_t *)v1087; - float32x4_t v1396 = *(const float32x4_t *)v1096; - float32x4_t v1398 = *(const float32x4_t *)v1105; - float32x4_t v1400 = *(const float32x4_t *)v1114; - float32x4_t v1402 = *(const float32x4_t *)v1123; - float32x4_t v1404 = *(const float32x4_t *)v1132; - float32x4_t v1406 = *(const float32x4_t *)v1141; - float32x4_t v1408 = *(const float32x4_t *)v1150; - float32x4_t v1410 = *(const float32x4_t *)v1159; - float32x4_t v1412 = *(const float32x4_t *)v1168; - float32x4_t v1414 = *(const float32x4_t *)v1177; - float32x4_t v1418 = *(const float32x4_t *)v1195; + float32x4_t v1382 = vld1q_f32((const float32_t *)v1033); + float32x4_t v1384 = vld1q_f32((const float32_t *)v1042); + float32x4_t v1386 = vld1q_f32((const float32_t *)v1051); + float32x4_t v1388 = vld1q_f32((const float32_t *)v1060); + float32x4_t 
v1390 = vld1q_f32((const float32_t *)v1069); + float32x4_t v1392 = vld1q_f32((const float32_t *)v1078); + float32x4_t v1394 = vld1q_f32((const float32_t *)v1087); + float32x4_t v1396 = vld1q_f32((const float32_t *)v1096); + float32x4_t v1398 = vld1q_f32((const float32_t *)v1105); + float32x4_t v1400 = vld1q_f32((const float32_t *)v1114); + float32x4_t v1402 = vld1q_f32((const float32_t *)v1123); + float32x4_t v1404 = vld1q_f32((const float32_t *)v1132); + float32x4_t v1406 = vld1q_f32((const float32_t *)v1141); + float32x4_t v1408 = vld1q_f32((const float32_t *)v1150); + float32x4_t v1410 = vld1q_f32((const float32_t *)v1159); + float32x4_t v1412 = vld1q_f32((const float32_t *)v1168); + float32x4_t v1414 = vld1q_f32((const float32_t *)v1177); + float32x4_t v1418 = vld1q_f32((const float32_t *)v1195); float32x4_t v35 = vaddq_f32(v1380, v1382); float32x4_t v36 = vsubq_f32(v1380, v1382); float32x4_t v51 = vaddq_f32(v1384, v1386); @@ -10698,8 +10698,8 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu20(const armral_cmplx_f32_t *restrict x, float32x4_t v353 = vsubq_f32(v333, v341); float32x4_t v354 = vaddq_f32(v341, v349); float32x4_t v374 = vmulq_f32(v372, v373); - *(float32x4_t *)v1205 = v198; - *(float32x4_t *)v1223 = v254; + vst1q_f32((float32_t *)v1205, v198); + vst1q_f32((float32_t *)v1223, v254); float32x4_t v239 = vaddq_f32(v238, v213); float32x4_t v240 = vsubq_f32(v238, v213); float32x4_t v241 = vsubq_f32(v221, v229); @@ -10725,20 +10725,20 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu20(const armral_cmplx_f32_t *restrict x, float32x4_t v302 = vsubq_f32(v296, v298); float32x4_t v407 = vaddq_f32(v406, v390); float32x4_t v408 = vsubq_f32(v406, v390); - *(float32x4_t *)v1214 = v416; - *(float32x4_t *)v1232 = v415; + vst1q_f32((float32_t *)v1214, v416); + vst1q_f32((float32_t *)v1232, v415); float32x4_t v411 = vaddq_f32(v407, v409); float32x4_t v412 = vsubq_f32(v407, v409); float32x4_t v413 = vaddq_f32(v408, v410); float32x4_t v414 = vsubq_f32(v408, v410); - *(float32x4_t 
*)v1241 = v244; - *(float32x4_t *)v1259 = v300; - *(float32x4_t *)v1277 = v246; - *(float32x4_t *)v1295 = v302; - *(float32x4_t *)v1313 = v245; - *(float32x4_t *)v1331 = v301; - *(float32x4_t *)v1349 = v243; - *(float32x4_t *)v1367 = v299; + vst1q_f32((float32_t *)v1241, v244); + vst1q_f32((float32_t *)v1259, v300); + vst1q_f32((float32_t *)v1277, v246); + vst1q_f32((float32_t *)v1295, v302); + vst1q_f32((float32_t *)v1313, v245); + vst1q_f32((float32_t *)v1331, v301); + vst1q_f32((float32_t *)v1349, v243); + vst1q_f32((float32_t *)v1367, v299); float32x4_t v445 = vaddq_f32(v356, v412); float32x4_t v446 = vsubq_f32(v356, v412); float32x4_t v475 = vaddq_f32(v358, v414); @@ -10747,14 +10747,14 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu20(const armral_cmplx_f32_t *restrict x, float32x4_t v506 = vsubq_f32(v357, v413); float32x4_t v535 = vaddq_f32(v355, v411); float32x4_t v536 = vsubq_f32(v355, v411); - *(float32x4_t *)v1250 = v446; - *(float32x4_t *)v1268 = v445; - *(float32x4_t *)v1286 = v476; - *(float32x4_t *)v1304 = v475; - *(float32x4_t *)v1322 = v506; - *(float32x4_t *)v1340 = v505; - *(float32x4_t *)v1358 = v536; - *(float32x4_t *)v1376 = v535; + vst1q_f32((float32_t *)v1250, v446); + vst1q_f32((float32_t *)v1268, v445); + vst1q_f32((float32_t *)v1286, v476); + vst1q_f32((float32_t *)v1304, v475); + vst1q_f32((float32_t *)v1322, v506); + vst1q_f32((float32_t *)v1340, v505); + vst1q_f32((float32_t *)v1358, v536); + vst1q_f32((float32_t *)v1376, v535); v5 += 2 * 1; v6 += 2 * 1; } @@ -11469,7 +11469,7 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu21(const armral_cmplx_f32_t *restrict x, float32x2_t v449 = (float32x2_t){v448, v448}; const float32x2_t *v1183 = &v5[0]; float32x2_t *v1355 = &v6[0]; - float32x4_t v1569 = *(const float32x4_t *)v1300; + float32x4_t v1569 = vld1q_f32((const float32_t *)v1300); float32x4_t v214 = vcombine_f32(v213, v213); float32x4_t v219 = vcombine_f32(v218, v218); float32x4_t v224 = vcombine_f32(v223, v223); @@ -11534,7 +11534,7 @@ void 
armral_fft_cf32_cf32_cf32_ac_n_uu21(const armral_cmplx_f32_t *restrict x, float32x2_t *v1517 = &v6[ostride * 6]; float32x2_t *v1526 = &v6[ostride * 13]; float32x2_t *v1535 = &v6[ostride * 20]; - float32x4_t v1543 = *(const float32x4_t *)v1183; + float32x4_t v1543 = vld1q_f32((const float32_t *)v1183); float32x4_t v237 = vcombine_f32(v235, v235); float32x4_t v245 = vcombine_f32(v243, v243); float32x4_t v253 = vcombine_f32(v251, v251); @@ -11548,25 +11548,25 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu21(const armral_cmplx_f32_t *restrict x, float32x4_t v414 = vcombine_f32(v412, v412); float32x4_t v422 = vcombine_f32(v420, v420); float32x4_t v430 = vcombine_f32(v428, v428); - float32x4_t v1539 = *(const float32x4_t *)v1164; - float32x4_t v1541 = *(const float32x4_t *)v1173; - float32x4_t v1545 = *(const float32x4_t *)v1192; - float32x4_t v1547 = *(const float32x4_t *)v1201; - float32x4_t v1549 = *(const float32x4_t *)v1210; - float32x4_t v1551 = *(const float32x4_t *)v1219; - float32x4_t v1553 = *(const float32x4_t *)v1228; - float32x4_t v1555 = *(const float32x4_t *)v1237; - float32x4_t v1557 = *(const float32x4_t *)v1246; - float32x4_t v1559 = *(const float32x4_t *)v1255; - float32x4_t v1561 = *(const float32x4_t *)v1264; - float32x4_t v1563 = *(const float32x4_t *)v1273; - float32x4_t v1565 = *(const float32x4_t *)v1282; - float32x4_t v1567 = *(const float32x4_t *)v1291; - float32x4_t v1571 = *(const float32x4_t *)v1309; - float32x4_t v1573 = *(const float32x4_t *)v1318; - float32x4_t v1575 = *(const float32x4_t *)v1327; - float32x4_t v1577 = *(const float32x4_t *)v1336; - float32x4_t v1579 = *(const float32x4_t *)v1345; + float32x4_t v1539 = vld1q_f32((const float32_t *)v1164); + float32x4_t v1541 = vld1q_f32((const float32_t *)v1173); + float32x4_t v1545 = vld1q_f32((const float32_t *)v1192); + float32x4_t v1547 = vld1q_f32((const float32_t *)v1201); + float32x4_t v1549 = vld1q_f32((const float32_t *)v1210); + float32x4_t v1551 = vld1q_f32((const float32_t 
*)v1219); + float32x4_t v1553 = vld1q_f32((const float32_t *)v1228); + float32x4_t v1555 = vld1q_f32((const float32_t *)v1237); + float32x4_t v1557 = vld1q_f32((const float32_t *)v1246); + float32x4_t v1559 = vld1q_f32((const float32_t *)v1255); + float32x4_t v1561 = vld1q_f32((const float32_t *)v1264); + float32x4_t v1563 = vld1q_f32((const float32_t *)v1273); + float32x4_t v1565 = vld1q_f32((const float32_t *)v1282); + float32x4_t v1567 = vld1q_f32((const float32_t *)v1291); + float32x4_t v1571 = vld1q_f32((const float32_t *)v1309); + float32x4_t v1573 = vld1q_f32((const float32_t *)v1318); + float32x4_t v1575 = vld1q_f32((const float32_t *)v1327); + float32x4_t v1577 = vld1q_f32((const float32_t *)v1336); + float32x4_t v1579 = vld1q_f32((const float32_t *)v1345); float32x4_t v35 = vaddq_f32(v1539, v1541); float32x4_t v36 = vsubq_f32(v1539, v1541); float32x4_t v59 = vaddq_f32(v1545, v1547); @@ -11690,7 +11690,7 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu21(const armral_cmplx_f32_t *restrict x, float32x4_t v462 = vsubq_f32(v461, v451); float32x4_t v464 = vaddq_f32(v463, v451); float32x4_t v471 = vaddq_f32(v197, v303); - *(float32x4_t *)v1355 = v197; + vst1q_f32((float32_t *)v1355, v197); float32x4_t v264 = vaddq_f32(v263, v220); float32x4_t v266 = vsubq_f32(v263, v220); float32x4_t v268 = vsubq_f32(v263, v225); @@ -11718,8 +11718,8 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu21(const armral_cmplx_f32_t *restrict x, float32x4_t v453 = vaddq_f32(v452, v415); float32x4_t v455 = vsubq_f32(v452, v415); float32x4_t v457 = vsubq_f32(v452, v423); - *(float32x4_t *)v1364 = v473; - *(float32x4_t *)v1373 = v472; + vst1q_f32((float32_t *)v1364, v473); + vst1q_f32((float32_t *)v1373, v472); float32x4_t v276 = vaddq_f32(v265, v271); float32x4_t v277 = vsubq_f32(v265, v271); float32x4_t v278 = vaddq_f32(v267, v273); @@ -11747,12 +11747,12 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu21(const armral_cmplx_f32_t *restrict x, float32x4_t v567 = vaddq_f32(v281, v374); float32x4_t v591 = 
vaddq_f32(v278, v371); float32x4_t v615 = vaddq_f32(v276, v369); - *(float32x4_t *)v1382 = v277; - *(float32x4_t *)v1409 = v279; - *(float32x4_t *)v1436 = v280; - *(float32x4_t *)v1463 = v281; - *(float32x4_t *)v1490 = v278; - *(float32x4_t *)v1517 = v276; + vst1q_f32((float32_t *)v1382, v277); + vst1q_f32((float32_t *)v1409, v279); + vst1q_f32((float32_t *)v1436, v280); + vst1q_f32((float32_t *)v1463, v281); + vst1q_f32((float32_t *)v1490, v278); + vst1q_f32((float32_t *)v1517, v276); float32x4_t v496 = vaddq_f32(v495, v466); float32x4_t v497 = vsubq_f32(v495, v466); float32x4_t v520 = vaddq_f32(v519, v468); @@ -11765,18 +11765,18 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu21(const armral_cmplx_f32_t *restrict x, float32x4_t v593 = vsubq_f32(v591, v467); float32x4_t v616 = vaddq_f32(v615, v465); float32x4_t v617 = vsubq_f32(v615, v465); - *(float32x4_t *)v1391 = v497; - *(float32x4_t *)v1400 = v496; - *(float32x4_t *)v1418 = v521; - *(float32x4_t *)v1427 = v520; - *(float32x4_t *)v1445 = v545; - *(float32x4_t *)v1454 = v544; - *(float32x4_t *)v1472 = v569; - *(float32x4_t *)v1481 = v568; - *(float32x4_t *)v1499 = v593; - *(float32x4_t *)v1508 = v592; - *(float32x4_t *)v1526 = v617; - *(float32x4_t *)v1535 = v616; + vst1q_f32((float32_t *)v1391, v497); + vst1q_f32((float32_t *)v1400, v496); + vst1q_f32((float32_t *)v1418, v521); + vst1q_f32((float32_t *)v1427, v520); + vst1q_f32((float32_t *)v1445, v545); + vst1q_f32((float32_t *)v1454, v544); + vst1q_f32((float32_t *)v1472, v569); + vst1q_f32((float32_t *)v1481, v568); + vst1q_f32((float32_t *)v1499, v593); + vst1q_f32((float32_t *)v1508, v592); + vst1q_f32((float32_t *)v1526, v617); + vst1q_f32((float32_t *)v1535, v616); v5 += 2 * 1; v6 += 2 * 1; } @@ -12713,7 +12713,7 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu22(const armral_cmplx_f32_t *restrict x, float32x2_t v582 = (float32x2_t){v580, v581}; const float32x2_t *v1446 = &v5[0]; float32x2_t *v1645 = &v6[0]; - float32x4_t v1864 = *(const float32x4_t *)v1563; + 
float32x4_t v1864 = vld1q_f32((const float32_t *)v1563); float32x4_t v461 = vcombine_f32(v460, v460); float32x2_t v467 = vmul_f32(v583, v465); float32x4_t v474 = vcombine_f32(v473, v473); @@ -12774,7 +12774,7 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu22(const armral_cmplx_f32_t *restrict x, float32x2_t *v1816 = &v6[ostride * 9]; float32x2_t *v1825 = &v6[ostride * 10]; float32x2_t *v1834 = &v6[ostride * 21]; - float32x4_t v1838 = *(const float32x4_t *)v1446; + float32x4_t v1838 = vld1q_f32((const float32_t *)v1446); float32x4_t v469 = vcombine_f32(v467, v467); float32x4_t v522 = vcombine_f32(v520, v520); float32x4_t v530 = vcombine_f32(v528, v528); @@ -12785,26 +12785,26 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu22(const armral_cmplx_f32_t *restrict x, float32x4_t v570 = vcombine_f32(v568, v568); float32x4_t v578 = vcombine_f32(v576, v576); float32x4_t v586 = vcombine_f32(v584, v584); - float32x4_t v1840 = *(const float32x4_t *)v1455; - float32x4_t v1842 = *(const float32x4_t *)v1464; - float32x4_t v1844 = *(const float32x4_t *)v1473; - float32x4_t v1846 = *(const float32x4_t *)v1482; - float32x4_t v1848 = *(const float32x4_t *)v1491; - float32x4_t v1850 = *(const float32x4_t *)v1500; - float32x4_t v1852 = *(const float32x4_t *)v1509; - float32x4_t v1854 = *(const float32x4_t *)v1518; - float32x4_t v1856 = *(const float32x4_t *)v1527; - float32x4_t v1858 = *(const float32x4_t *)v1536; - float32x4_t v1860 = *(const float32x4_t *)v1545; - float32x4_t v1862 = *(const float32x4_t *)v1554; - float32x4_t v1866 = *(const float32x4_t *)v1572; - float32x4_t v1868 = *(const float32x4_t *)v1581; - float32x4_t v1870 = *(const float32x4_t *)v1590; - float32x4_t v1872 = *(const float32x4_t *)v1599; - float32x4_t v1874 = *(const float32x4_t *)v1608; - float32x4_t v1876 = *(const float32x4_t *)v1617; - float32x4_t v1878 = *(const float32x4_t *)v1626; - float32x4_t v1880 = *(const float32x4_t *)v1635; + float32x4_t v1840 = vld1q_f32((const float32_t *)v1455); + float32x4_t v1842 = 
vld1q_f32((const float32_t *)v1464); + float32x4_t v1844 = vld1q_f32((const float32_t *)v1473); + float32x4_t v1846 = vld1q_f32((const float32_t *)v1482); + float32x4_t v1848 = vld1q_f32((const float32_t *)v1491); + float32x4_t v1850 = vld1q_f32((const float32_t *)v1500); + float32x4_t v1852 = vld1q_f32((const float32_t *)v1509); + float32x4_t v1854 = vld1q_f32((const float32_t *)v1518); + float32x4_t v1856 = vld1q_f32((const float32_t *)v1527); + float32x4_t v1858 = vld1q_f32((const float32_t *)v1536); + float32x4_t v1860 = vld1q_f32((const float32_t *)v1545); + float32x4_t v1862 = vld1q_f32((const float32_t *)v1554); + float32x4_t v1866 = vld1q_f32((const float32_t *)v1572); + float32x4_t v1868 = vld1q_f32((const float32_t *)v1581); + float32x4_t v1870 = vld1q_f32((const float32_t *)v1590); + float32x4_t v1872 = vld1q_f32((const float32_t *)v1599); + float32x4_t v1874 = vld1q_f32((const float32_t *)v1608); + float32x4_t v1876 = vld1q_f32((const float32_t *)v1617); + float32x4_t v1878 = vld1q_f32((const float32_t *)v1626); + float32x4_t v1880 = vld1q_f32((const float32_t *)v1635); float32x4_t v35 = vaddq_f32(v1838, v1840); float32x4_t v36 = vsubq_f32(v1838, v1840); float32x4_t v51 = vaddq_f32(v1842, v1844); @@ -12995,8 +12995,8 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu22(const armral_cmplx_f32_t *restrict x, float32x4_t v602 = vsubq_f32(v571, v587); float32x4_t v603 = vaddq_f32(v593, v595); float32x4_t v621 = vaddq_f32(v597, v598); - *(float32x4_t *)v1645 = v213; - *(float32x4_t *)v1654 = v432; + vst1q_f32((float32_t *)v1645, v213); + vst1q_f32((float32_t *)v1654, v432); float32x4_t v385 = vaddq_f32(v384, v369); float32x4_t v386 = vsubq_f32(v369, v371); float32x4_t v388 = vaddq_f32(v369, v375); @@ -13057,26 +13057,26 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu22(const armral_cmplx_f32_t *restrict x, float32x4_t v415 = vsubq_f32(v393, v405); float32x4_t v625 = vaddq_f32(v612, v624); float32x4_t v634 = vsubq_f32(v612, v624); - *(float32x4_t *)v1681 = v414; - 
*(float32x4_t *)v1690 = v633; - *(float32x4_t *)v1699 = v413; - *(float32x4_t *)v1708 = v632; - *(float32x4_t *)v1717 = v412; - *(float32x4_t *)v1726 = v631; - *(float32x4_t *)v1735 = v411; - *(float32x4_t *)v1744 = v630; - *(float32x4_t *)v1753 = v410; - *(float32x4_t *)v1762 = v629; - *(float32x4_t *)v1771 = v409; - *(float32x4_t *)v1780 = v628; - *(float32x4_t *)v1789 = v408; - *(float32x4_t *)v1798 = v627; - *(float32x4_t *)v1807 = v407; - *(float32x4_t *)v1816 = v626; - *(float32x4_t *)v1663 = v415; - *(float32x4_t *)v1672 = v634; - *(float32x4_t *)v1825 = v406; - *(float32x4_t *)v1834 = v625; + vst1q_f32((float32_t *)v1681, v414); + vst1q_f32((float32_t *)v1690, v633); + vst1q_f32((float32_t *)v1699, v413); + vst1q_f32((float32_t *)v1708, v632); + vst1q_f32((float32_t *)v1717, v412); + vst1q_f32((float32_t *)v1726, v631); + vst1q_f32((float32_t *)v1735, v411); + vst1q_f32((float32_t *)v1744, v630); + vst1q_f32((float32_t *)v1753, v410); + vst1q_f32((float32_t *)v1762, v629); + vst1q_f32((float32_t *)v1771, v409); + vst1q_f32((float32_t *)v1780, v628); + vst1q_f32((float32_t *)v1789, v408); + vst1q_f32((float32_t *)v1798, v627); + vst1q_f32((float32_t *)v1807, v407); + vst1q_f32((float32_t *)v1816, v626); + vst1q_f32((float32_t *)v1663, v415); + vst1q_f32((float32_t *)v1672, v634); + vst1q_f32((float32_t *)v1825, v406); + vst1q_f32((float32_t *)v1834, v625); v5 += 2 * 1; v6 += 2 * 1; } @@ -14121,7 +14121,7 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu24(const armral_cmplx_f32_t *restrict x, float32x2_t v428 = (float32x2_t){v426, v427}; const float32x2_t *v1168 = &v5[0]; float32x2_t *v1367 = &v6[0]; - float32x4_t v1598 = *(const float32x4_t *)v1240; + float32x4_t v1598 = vld1q_f32((const float32_t *)v1240); float32x2_t v261 = vmul_f32(v429, v259); float32x2_t v269 = vmul_f32(v429, v267); float32x4_t v276 = vcombine_f32(v275, v275); @@ -14177,35 +14177,35 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu24(const armral_cmplx_f32_t *restrict x, float32x2_t *v1556 = 
&v6[ostride * 15]; float32x2_t *v1565 = &v6[ostride * 7]; float32x2_t *v1574 = &v6[ostride * 23]; - float32x4_t v1582 = *(const float32x4_t *)v1168; + float32x4_t v1582 = vld1q_f32((const float32_t *)v1168); float32x4_t v263 = vcombine_f32(v261, v261); float32x4_t v271 = vcombine_f32(v269, v269); float32x4_t v338 = vcombine_f32(v336, v336); float32x4_t v346 = vcombine_f32(v344, v344); float32x4_t v414 = vcombine_f32(v412, v412); float32x4_t v432 = vcombine_f32(v430, v430); - float32x4_t v1578 = *(const float32x4_t *)v1149; - float32x4_t v1580 = *(const float32x4_t *)v1158; - float32x4_t v1584 = *(const float32x4_t *)v1177; - float32x4_t v1586 = *(const float32x4_t *)v1186; - float32x4_t v1588 = *(const float32x4_t *)v1195; - float32x4_t v1590 = *(const float32x4_t *)v1204; - float32x4_t v1592 = *(const float32x4_t *)v1213; - float32x4_t v1594 = *(const float32x4_t *)v1222; - float32x4_t v1596 = *(const float32x4_t *)v1231; - float32x4_t v1600 = *(const float32x4_t *)v1249; - float32x4_t v1602 = *(const float32x4_t *)v1258; - float32x4_t v1604 = *(const float32x4_t *)v1267; - float32x4_t v1606 = *(const float32x4_t *)v1276; - float32x4_t v1608 = *(const float32x4_t *)v1285; - float32x4_t v1610 = *(const float32x4_t *)v1294; - float32x4_t v1612 = *(const float32x4_t *)v1303; - float32x4_t v1614 = *(const float32x4_t *)v1312; - float32x4_t v1616 = *(const float32x4_t *)v1321; - float32x4_t v1618 = *(const float32x4_t *)v1330; - float32x4_t v1620 = *(const float32x4_t *)v1339; - float32x4_t v1622 = *(const float32x4_t *)v1348; - float32x4_t v1624 = *(const float32x4_t *)v1357; + float32x4_t v1578 = vld1q_f32((const float32_t *)v1149); + float32x4_t v1580 = vld1q_f32((const float32_t *)v1158); + float32x4_t v1584 = vld1q_f32((const float32_t *)v1177); + float32x4_t v1586 = vld1q_f32((const float32_t *)v1186); + float32x4_t v1588 = vld1q_f32((const float32_t *)v1195); + float32x4_t v1590 = vld1q_f32((const float32_t *)v1204); + float32x4_t v1592 = vld1q_f32((const 
float32_t *)v1213); + float32x4_t v1594 = vld1q_f32((const float32_t *)v1222); + float32x4_t v1596 = vld1q_f32((const float32_t *)v1231); + float32x4_t v1600 = vld1q_f32((const float32_t *)v1249); + float32x4_t v1602 = vld1q_f32((const float32_t *)v1258); + float32x4_t v1604 = vld1q_f32((const float32_t *)v1267); + float32x4_t v1606 = vld1q_f32((const float32_t *)v1276); + float32x4_t v1608 = vld1q_f32((const float32_t *)v1285); + float32x4_t v1610 = vld1q_f32((const float32_t *)v1294); + float32x4_t v1612 = vld1q_f32((const float32_t *)v1303); + float32x4_t v1614 = vld1q_f32((const float32_t *)v1312); + float32x4_t v1616 = vld1q_f32((const float32_t *)v1321); + float32x4_t v1618 = vld1q_f32((const float32_t *)v1330); + float32x4_t v1620 = vld1q_f32((const float32_t *)v1339); + float32x4_t v1622 = vld1q_f32((const float32_t *)v1348); + float32x4_t v1624 = vld1q_f32((const float32_t *)v1357); float32x4_t v35 = vaddq_f32(v1578, v1580); float32x4_t v36 = vsubq_f32(v1578, v1580); float32x4_t v59 = vaddq_f32(v1584, v1586); @@ -14325,8 +14325,8 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu24(const armral_cmplx_f32_t *restrict x, float32x4_t v437 = vsubq_f32(v415, v433); float32x4_t v444 = vaddq_f32(v225, v308); float32x4_t v540 = vaddq_f32(v226, v313); - *(float32x4_t *)v1367 = v225; - *(float32x4_t *)v1475 = v226; + vst1q_f32((float32_t *)v1367, v225); + vst1q_f32((float32_t *)v1475, v226); float32x4_t v278 = vaddq_f32(v222, v251); float32x4_t v279 = vsubq_f32(v222, v251); float32x4_t v282 = vaddq_f32(v264, v272); @@ -14349,12 +14349,12 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu24(const armral_cmplx_f32_t *restrict x, float32x4_t v287 = vsubq_f32(v281, v283); float32x4_t v492 = vaddq_f32(v279, v354); float32x4_t v588 = vaddq_f32(v278, v353); - *(float32x4_t *)v1376 = v446; - *(float32x4_t *)v1385 = v445; - *(float32x4_t *)v1421 = v279; - *(float32x4_t *)v1484 = v542; - *(float32x4_t *)v1493 = v541; - *(float32x4_t *)v1529 = v278; + vst1q_f32((float32_t *)v1376, v446); + 
vst1q_f32((float32_t *)v1385, v445); + vst1q_f32((float32_t *)v1421, v279); + vst1q_f32((float32_t *)v1484, v542); + vst1q_f32((float32_t *)v1493, v541); + vst1q_f32((float32_t *)v1529, v278); float32x4_t v468 = vaddq_f32(v285, v360); float32x4_t v493 = vaddq_f32(v492, v435); float32x4_t v494 = vsubq_f32(v492, v435); @@ -14363,10 +14363,10 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu24(const armral_cmplx_f32_t *restrict x, float32x4_t v589 = vaddq_f32(v588, v434); float32x4_t v590 = vsubq_f32(v588, v434); float32x4_t v612 = vaddq_f32(v284, v359); - *(float32x4_t *)v1394 = v285; - *(float32x4_t *)v1448 = v286; - *(float32x4_t *)v1502 = v287; - *(float32x4_t *)v1556 = v284; + vst1q_f32((float32_t *)v1394, v285); + vst1q_f32((float32_t *)v1448, v286); + vst1q_f32((float32_t *)v1502, v287); + vst1q_f32((float32_t *)v1556, v284); float32x4_t v469 = vaddq_f32(v468, v441); float32x4_t v470 = vsubq_f32(v468, v441); float32x4_t v517 = vaddq_f32(v516, v442); @@ -14375,18 +14375,18 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu24(const armral_cmplx_f32_t *restrict x, float32x4_t v566 = vsubq_f32(v564, v443); float32x4_t v613 = vaddq_f32(v612, v440); float32x4_t v614 = vsubq_f32(v612, v440); - *(float32x4_t *)v1430 = v494; - *(float32x4_t *)v1439 = v493; - *(float32x4_t *)v1538 = v590; - *(float32x4_t *)v1547 = v589; - *(float32x4_t *)v1403 = v470; - *(float32x4_t *)v1412 = v469; - *(float32x4_t *)v1457 = v518; - *(float32x4_t *)v1466 = v517; - *(float32x4_t *)v1511 = v566; - *(float32x4_t *)v1520 = v565; - *(float32x4_t *)v1565 = v614; - *(float32x4_t *)v1574 = v613; + vst1q_f32((float32_t *)v1430, v494); + vst1q_f32((float32_t *)v1439, v493); + vst1q_f32((float32_t *)v1538, v590); + vst1q_f32((float32_t *)v1547, v589); + vst1q_f32((float32_t *)v1403, v470); + vst1q_f32((float32_t *)v1412, v469); + vst1q_f32((float32_t *)v1457, v518); + vst1q_f32((float32_t *)v1466, v517); + vst1q_f32((float32_t *)v1511, v566); + vst1q_f32((float32_t *)v1520, v565); + vst1q_f32((float32_t *)v1565, 
v614); + vst1q_f32((float32_t *)v1574, v613); v5 += 2 * 1; v6 += 2 * 1; } @@ -15191,7 +15191,7 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu25(const armral_cmplx_f32_t *restrict x, float32x2_t v1690 = (float32x2_t){v1689, v1689}; const float32x2_t *v3110 = &v5[0]; float32x2_t *v3336 = &v6[0]; - float32x4_t v3566 = *(const float32x4_t *)v3155; + float32x4_t v3566 = vld1q_f32((const float32_t *)v3155); float32x2_t v917 = (float32x2_t){v916, v919}; float32x4_t v1032 = vcombine_f32(v1031, v1031); float32x2_t v1038 = vmul_f32(v1662, v1036); @@ -15261,7 +15261,7 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu25(const armral_cmplx_f32_t *restrict x, float32x2_t *v3534 = &v6[ostride * 14]; float32x2_t *v3543 = &v6[ostride * 19]; float32x2_t *v3552 = &v6[ostride * 24]; - float32x4_t v3556 = *(const float32x4_t *)v3110; + float32x4_t v3556 = vld1q_f32((const float32_t *)v3110); float32x4_t v921 = vcombine_f32(v917, v917); float32x4_t v1040 = vcombine_f32(v1038, v1038); float32x4_t v1208 = vcombine_f32(v1206, v1206); @@ -15273,29 +15273,29 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu25(const armral_cmplx_f32_t *restrict x, float32x4_t v1572 = vcombine_f32(v1570, v1570); float32x4_t v1593 = vcombine_f32(v1591, v1591); float32x4_t v1665 = vcombine_f32(v1663, v1663); - float32x4_t v3558 = *(const float32x4_t *)v3119; - float32x4_t v3560 = *(const float32x4_t *)v3128; - float32x4_t v3562 = *(const float32x4_t *)v3137; - float32x4_t v3564 = *(const float32x4_t *)v3146; - float32x4_t v3568 = *(const float32x4_t *)v3164; - float32x4_t v3570 = *(const float32x4_t *)v3173; - float32x4_t v3572 = *(const float32x4_t *)v3182; - float32x4_t v3574 = *(const float32x4_t *)v3191; - float32x4_t v3576 = *(const float32x4_t *)v3200; - float32x4_t v3578 = *(const float32x4_t *)v3209; - float32x4_t v3580 = *(const float32x4_t *)v3218; - float32x4_t v3582 = *(const float32x4_t *)v3227; - float32x4_t v3584 = *(const float32x4_t *)v3236; - float32x4_t v3586 = *(const float32x4_t *)v3245; - float32x4_t v3588 = 
*(const float32x4_t *)v3254; - float32x4_t v3590 = *(const float32x4_t *)v3263; - float32x4_t v3592 = *(const float32x4_t *)v3272; - float32x4_t v3594 = *(const float32x4_t *)v3281; - float32x4_t v3596 = *(const float32x4_t *)v3290; - float32x4_t v3598 = *(const float32x4_t *)v3299; - float32x4_t v3600 = *(const float32x4_t *)v3308; - float32x4_t v3602 = *(const float32x4_t *)v3317; - float32x4_t v3604 = *(const float32x4_t *)v3326; + float32x4_t v3558 = vld1q_f32((const float32_t *)v3119); + float32x4_t v3560 = vld1q_f32((const float32_t *)v3128); + float32x4_t v3562 = vld1q_f32((const float32_t *)v3137); + float32x4_t v3564 = vld1q_f32((const float32_t *)v3146); + float32x4_t v3568 = vld1q_f32((const float32_t *)v3164); + float32x4_t v3570 = vld1q_f32((const float32_t *)v3173); + float32x4_t v3572 = vld1q_f32((const float32_t *)v3182); + float32x4_t v3574 = vld1q_f32((const float32_t *)v3191); + float32x4_t v3576 = vld1q_f32((const float32_t *)v3200); + float32x4_t v3578 = vld1q_f32((const float32_t *)v3209); + float32x4_t v3580 = vld1q_f32((const float32_t *)v3218); + float32x4_t v3582 = vld1q_f32((const float32_t *)v3227); + float32x4_t v3584 = vld1q_f32((const float32_t *)v3236); + float32x4_t v3586 = vld1q_f32((const float32_t *)v3245); + float32x4_t v3588 = vld1q_f32((const float32_t *)v3254); + float32x4_t v3590 = vld1q_f32((const float32_t *)v3263); + float32x4_t v3592 = vld1q_f32((const float32_t *)v3272); + float32x4_t v3594 = vld1q_f32((const float32_t *)v3281); + float32x4_t v3596 = vld1q_f32((const float32_t *)v3290); + float32x4_t v3598 = vld1q_f32((const float32_t *)v3299); + float32x4_t v3600 = vld1q_f32((const float32_t *)v3308); + float32x4_t v3602 = vld1q_f32((const float32_t *)v3317); + float32x4_t v3604 = vld1q_f32((const float32_t *)v3326); float32x4_t v66 = vrev64q_f32(v3558); float32x4_t v80 = vrev64q_f32(v3560); float32x4_t v94 = vrev64q_f32(v3564); @@ -15597,7 +15597,7 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu25(const armral_cmplx_f32_t 
*restrict x, float32x4_t v1560 = vfmaq_f32(v1559, v524, v1550); float32x4_t v1574 = vfmaq_f32(v1573, v860, v1564); float32x4_t v1595 = vfmaq_f32(v1594, v692, v1585); - *(float32x4_t *)v3336 = v963; + vst1q_f32((float32_t *)v3336, v963); float32x4_t v950 = vsubq_f32(v938, v949); float32x4_t v955 = vmulq_f32(v938, v1691); float32x4_t v1077 = vsubq_f32(v1076, v1071); @@ -15656,8 +15656,8 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu25(const armral_cmplx_f32_t *restrict x, float32x4_t v1604 = vsubq_f32(v1581, v1602); float32x4_t v1616 = vaddq_f32(v1575, v1615); float32x4_t v1634 = vsubq_f32(v1633, v1596); - *(float32x4_t *)v3354 = v995; - *(float32x4_t *)v3426 = v1299; + vst1q_f32((float32_t *)v3354, v995); + vst1q_f32((float32_t *)v3426, v1299); float32x4_t v1021 = vsubq_f32(v1020, v979); float32x4_t v1106 = vsubq_f32(v167, v1105); float32x4_t v1146 = vmulq_f32(v1144, v1665); @@ -15672,10 +15672,10 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu25(const armral_cmplx_f32_t *restrict x, float32x4_t v1635 = vaddq_f32(v188, v1603); float32x4_t v1648 = vrev64q_f32(v1616); float32x4_t v1664 = vrev64q_f32(v1634); - *(float32x4_t *)v3345 = v979; - *(float32x4_t *)v3363 = v1008; - *(float32x4_t *)v3381 = v1131; - *(float32x4_t *)v3471 = v1467; + vst1q_f32((float32_t *)v3345, v979); + vst1q_f32((float32_t *)v3363, v1008); + vst1q_f32((float32_t *)v3381, v1131); + vst1q_f32((float32_t *)v3471, v1467); float32x4_t v1118 = vsubq_f32(v1106, v1117); float32x4_t v1123 = vmulq_f32(v1106, v1691); float32x4_t v1292 = vsubq_f32(v1291, v1286); @@ -15686,8 +15686,8 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu25(const armral_cmplx_f32_t *restrict x, float32x4_t v1610 = vsubq_f32(v188, v1609); float32x4_t v1650 = vmulq_f32(v1648, v1665); float32x4_t v1666 = vmulq_f32(v1664, v1665); - *(float32x4_t *)v3372 = v1021; - *(float32x4_t *)v3516 = v1635; + vst1q_f32((float32_t *)v3372, v1021); + vst1q_f32((float32_t *)v3516, v1635); float32x4_t v1124 = vsubq_f32(v1123, v1118); float32x4_t v1163 = 
vsubq_f32(v1118, v1162); float32x4_t v1175 = vmulq_f32(v1118, v1691); @@ -15699,7 +15699,7 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu25(const armral_cmplx_f32_t *restrict x, float32x4_t v1511 = vmulq_f32(v1454, v1691); float32x4_t v1622 = vsubq_f32(v1610, v1621); float32x4_t v1627 = vmulq_f32(v1610, v1691); - *(float32x4_t *)v3444 = v1331; + vst1q_f32((float32_t *)v3444, v1331); float32x4_t v1147 = vsubq_f32(v1124, v1146); float32x4_t v1176 = vsubq_f32(v1175, v1163); float32x4_t v1188 = vmulq_f32(v1124, v1691); @@ -15710,27 +15710,27 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu25(const armral_cmplx_f32_t *restrict x, float32x4_t v1628 = vsubq_f32(v1627, v1622); float32x4_t v1667 = vsubq_f32(v1622, v1666); float32x4_t v1679 = vmulq_f32(v1622, v1691); - *(float32x4_t *)v3399 = v1163; - *(float32x4_t *)v3435 = v1315; - *(float32x4_t *)v3453 = v1344; - *(float32x4_t *)v3489 = v1499; + vst1q_f32((float32_t *)v3399, v1163); + vst1q_f32((float32_t *)v3435, v1315); + vst1q_f32((float32_t *)v3453, v1344); + vst1q_f32((float32_t *)v3489, v1499); float32x4_t v1189 = vsubq_f32(v1188, v1147); float32x4_t v1525 = vsubq_f32(v1524, v1483); float32x4_t v1651 = vsubq_f32(v1628, v1650); float32x4_t v1680 = vsubq_f32(v1679, v1667); float32x4_t v1692 = vmulq_f32(v1628, v1691); - *(float32x4_t *)v3390 = v1147; - *(float32x4_t *)v3408 = v1176; - *(float32x4_t *)v3462 = v1357; - *(float32x4_t *)v3480 = v1483; - *(float32x4_t *)v3498 = v1512; - *(float32x4_t *)v3534 = v1667; + vst1q_f32((float32_t *)v3390, v1147); + vst1q_f32((float32_t *)v3408, v1176); + vst1q_f32((float32_t *)v3462, v1357); + vst1q_f32((float32_t *)v3480, v1483); + vst1q_f32((float32_t *)v3498, v1512); + vst1q_f32((float32_t *)v3534, v1667); float32x4_t v1693 = vsubq_f32(v1692, v1651); - *(float32x4_t *)v3417 = v1189; - *(float32x4_t *)v3507 = v1525; - *(float32x4_t *)v3525 = v1651; - *(float32x4_t *)v3543 = v1680; - *(float32x4_t *)v3552 = v1693; + vst1q_f32((float32_t *)v3417, v1189); + vst1q_f32((float32_t *)v3507, 
v1525); + vst1q_f32((float32_t *)v3525, v1651); + vst1q_f32((float32_t *)v3543, v1680); + vst1q_f32((float32_t *)v3552, v1693); v5 += 2 * 1; v6 += 2 * 1; } @@ -16933,7 +16933,7 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu32(const armral_cmplx_f32_t *restrict x, float32x2_t v1139 = (float32x2_t){v1137, v1138}; const float32x2_t *v2149 = &v5[0]; float32x2_t *v2438 = &v6[0]; - float32x4_t v2753 = *(const float32x4_t *)v2293; + float32x4_t v2753 = vld1q_f32((const float32_t *)v2293); float32x4_t v690 = vcombine_f32(v689, v689); float32x4_t v760 = vcombine_f32(v759, v759); float32x2_t v766 = vmul_f32(v1140, v764); @@ -17016,7 +17016,7 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu32(const armral_cmplx_f32_t *restrict x, float32x2_t *v2699 = &v6[ostride * 15]; float32x2_t *v2708 = &v6[ostride * 23]; float32x2_t *v2717 = &v6[ostride * 31]; - float32x4_t v2721 = *(const float32x4_t *)v2149; + float32x4_t v2721 = vld1q_f32((const float32_t *)v2149); float32x4_t v768 = vcombine_f32(v766, v766); float32x4_t v838 = vcombine_f32(v836, v836); float32x4_t v921 = vcombine_f32(v919, v919); @@ -17027,36 +17027,36 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu32(const armral_cmplx_f32_t *restrict x, float32x4_t v1118 = vcombine_f32(v1116, v1116); float32x4_t v1131 = vcombine_f32(v1129, v1129); float32x4_t v1143 = vcombine_f32(v1141, v1141); - float32x4_t v2723 = *(const float32x4_t *)v2158; - float32x4_t v2725 = *(const float32x4_t *)v2167; - float32x4_t v2727 = *(const float32x4_t *)v2176; - float32x4_t v2729 = *(const float32x4_t *)v2185; - float32x4_t v2731 = *(const float32x4_t *)v2194; - float32x4_t v2733 = *(const float32x4_t *)v2203; - float32x4_t v2735 = *(const float32x4_t *)v2212; - float32x4_t v2737 = *(const float32x4_t *)v2221; - float32x4_t v2739 = *(const float32x4_t *)v2230; - float32x4_t v2741 = *(const float32x4_t *)v2239; - float32x4_t v2743 = *(const float32x4_t *)v2248; - float32x4_t v2745 = *(const float32x4_t *)v2257; - float32x4_t v2747 = *(const float32x4_t *)v2266; - 
float32x4_t v2749 = *(const float32x4_t *)v2275; - float32x4_t v2751 = *(const float32x4_t *)v2284; - float32x4_t v2755 = *(const float32x4_t *)v2302; - float32x4_t v2757 = *(const float32x4_t *)v2311; - float32x4_t v2759 = *(const float32x4_t *)v2320; - float32x4_t v2761 = *(const float32x4_t *)v2329; - float32x4_t v2763 = *(const float32x4_t *)v2338; - float32x4_t v2765 = *(const float32x4_t *)v2347; - float32x4_t v2767 = *(const float32x4_t *)v2356; - float32x4_t v2769 = *(const float32x4_t *)v2365; - float32x4_t v2771 = *(const float32x4_t *)v2374; - float32x4_t v2773 = *(const float32x4_t *)v2383; - float32x4_t v2775 = *(const float32x4_t *)v2392; - float32x4_t v2777 = *(const float32x4_t *)v2401; - float32x4_t v2779 = *(const float32x4_t *)v2410; - float32x4_t v2781 = *(const float32x4_t *)v2419; - float32x4_t v2783 = *(const float32x4_t *)v2428; + float32x4_t v2723 = vld1q_f32((const float32_t *)v2158); + float32x4_t v2725 = vld1q_f32((const float32_t *)v2167); + float32x4_t v2727 = vld1q_f32((const float32_t *)v2176); + float32x4_t v2729 = vld1q_f32((const float32_t *)v2185); + float32x4_t v2731 = vld1q_f32((const float32_t *)v2194); + float32x4_t v2733 = vld1q_f32((const float32_t *)v2203); + float32x4_t v2735 = vld1q_f32((const float32_t *)v2212); + float32x4_t v2737 = vld1q_f32((const float32_t *)v2221); + float32x4_t v2739 = vld1q_f32((const float32_t *)v2230); + float32x4_t v2741 = vld1q_f32((const float32_t *)v2239); + float32x4_t v2743 = vld1q_f32((const float32_t *)v2248); + float32x4_t v2745 = vld1q_f32((const float32_t *)v2257); + float32x4_t v2747 = vld1q_f32((const float32_t *)v2266); + float32x4_t v2749 = vld1q_f32((const float32_t *)v2275); + float32x4_t v2751 = vld1q_f32((const float32_t *)v2284); + float32x4_t v2755 = vld1q_f32((const float32_t *)v2302); + float32x4_t v2757 = vld1q_f32((const float32_t *)v2311); + float32x4_t v2759 = vld1q_f32((const float32_t *)v2320); + float32x4_t v2761 = vld1q_f32((const float32_t *)v2329); + float32x4_t 
v2763 = vld1q_f32((const float32_t *)v2338); + float32x4_t v2765 = vld1q_f32((const float32_t *)v2347); + float32x4_t v2767 = vld1q_f32((const float32_t *)v2356); + float32x4_t v2769 = vld1q_f32((const float32_t *)v2365); + float32x4_t v2771 = vld1q_f32((const float32_t *)v2374); + float32x4_t v2773 = vld1q_f32((const float32_t *)v2383); + float32x4_t v2775 = vld1q_f32((const float32_t *)v2392); + float32x4_t v2777 = vld1q_f32((const float32_t *)v2401); + float32x4_t v2779 = vld1q_f32((const float32_t *)v2410); + float32x4_t v2781 = vld1q_f32((const float32_t *)v2419); + float32x4_t v2783 = vld1q_f32((const float32_t *)v2428); float32x4_t v35 = vaddq_f32(v2721, v2723); float32x4_t v36 = vsubq_f32(v2721, v2723); float32x4_t v51 = vaddq_f32(v2725, v2727); @@ -17217,8 +17217,8 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu32(const armral_cmplx_f32_t *restrict x, float32x4_t v1047 = vrev64q_f32(v470); float32x4_t v1054 = vmulq_f32(v602, v1053); float32x4_t v1060 = vrev64q_f32(v602); - *(float32x4_t *)v2438 = v655; - *(float32x4_t *)v2456 = v656; + vst1q_f32((float32_t *)v2438, v655); + vst1q_f32((float32_t *)v2456, v656); float32x4_t v146 = vrev64q_f32(v140); float32x4_t v149 = vaddq_f32(v63, v139); float32x4_t v150 = vsubq_f32(v63, v139); @@ -17262,8 +17262,8 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu32(const armral_cmplx_f32_t *restrict x, float32x4_t v990 = vrev64q_f32(v642); float32x4_t v1063 = vfmaq_f32(v1041, v1047, v1048); float32x4_t v1064 = vfmaq_f32(v1054, v1060, v1061); - *(float32x4_t *)v2447 = v657; - *(float32x4_t *)v2465 = v658; + vst1q_f32((float32_t *)v2447, v657); + vst1q_f32((float32_t *)v2465, v658); float32x4_t v151 = vsubq_f32(v64, v148); float32x4_t v152 = vaddq_f32(v64, v148); float32x4_t v290 = vrev64q_f32(v284); @@ -17305,8 +17305,8 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu32(const armral_cmplx_f32_t *restrict x, float32x4_t v1117 = vrev64q_f32(v512); float32x4_t v1124 = vmulq_f32(v644, v1123); float32x4_t v1130 = vrev64q_f32(v644); - *(float32x4_t 
*)v2582 = v935; - *(float32x4_t *)v2600 = v936; + vst1q_f32((float32_t *)v2582, v935); + vst1q_f32((float32_t *)v2600, v936); float32x4_t v295 = vsubq_f32(v150, v292); float32x4_t v296 = vaddq_f32(v150, v292); float32x4_t v379 = vsubq_f32(v152, v376); @@ -17321,8 +17321,8 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu32(const armral_cmplx_f32_t *restrict x, float32x4_t v1074 = vmulq_f32(v1072, v1143); float32x4_t v1075 = vaddq_f32(v337, v1065); float32x4_t v1076 = vsubq_f32(v337, v1065); - *(float32x4_t *)v2510 = v795; - *(float32x4_t *)v2528 = v796; + vst1q_f32((float32_t *)v2510, v795); + vst1q_f32((float32_t *)v2528, v796); float32x4_t v722 = vrev64q_f32(v716); float32x4_t v725 = vaddq_f32(v293, v715); float32x4_t v726 = vsubq_f32(v293, v715); @@ -17337,24 +17337,24 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu32(const armral_cmplx_f32_t *restrict x, float32x4_t v1078 = vaddq_f32(v338, v1074); float32x4_t v1133 = vfmaq_f32(v1111, v1117, v1118); float32x4_t v1134 = vfmaq_f32(v1124, v1130, v1131); - *(float32x4_t *)v2591 = v937; - *(float32x4_t *)v2609 = v938; - *(float32x4_t *)v2654 = v1075; - *(float32x4_t *)v2672 = v1076; + vst1q_f32((float32_t *)v2591, v937); + vst1q_f32((float32_t *)v2609, v938); + vst1q_f32((float32_t *)v2654, v1075); + vst1q_f32((float32_t *)v2672, v1076); float32x4_t v724 = vmulq_f32(v722, v1143); float32x4_t v855 = vaddq_f32(v853, v854); float32x4_t v856 = vsubq_f32(v854, v853); float32x4_t v1004 = vmulq_f32(v1002, v1143); float32x4_t v1135 = vaddq_f32(v1133, v1134); float32x4_t v1136 = vsubq_f32(v1134, v1133); - *(float32x4_t *)v2474 = v725; - *(float32x4_t *)v2492 = v726; - *(float32x4_t *)v2519 = v797; - *(float32x4_t *)v2537 = v798; - *(float32x4_t *)v2618 = v1005; - *(float32x4_t *)v2636 = v1006; - *(float32x4_t *)v2663 = v1077; - *(float32x4_t *)v2681 = v1078; + vst1q_f32((float32_t *)v2474, v725); + vst1q_f32((float32_t *)v2492, v726); + vst1q_f32((float32_t *)v2519, v797); + vst1q_f32((float32_t *)v2537, v798); + vst1q_f32((float32_t 
*)v2618, v1005); + vst1q_f32((float32_t *)v2636, v1006); + vst1q_f32((float32_t *)v2663, v1077); + vst1q_f32((float32_t *)v2681, v1078); float32x4_t v727 = vsubq_f32(v294, v724); float32x4_t v728 = vaddq_f32(v294, v724); float32x4_t v862 = vrev64q_f32(v856); @@ -17367,22 +17367,22 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu32(const armral_cmplx_f32_t *restrict x, float32x4_t v1146 = vsubq_f32(v379, v1135); float32x4_t v864 = vmulq_f32(v862, v1143); float32x4_t v1144 = vmulq_f32(v1142, v1143); - *(float32x4_t *)v2483 = v727; - *(float32x4_t *)v2501 = v728; - *(float32x4_t *)v2546 = v865; - *(float32x4_t *)v2564 = v866; - *(float32x4_t *)v2627 = v1007; - *(float32x4_t *)v2645 = v1008; - *(float32x4_t *)v2690 = v1145; - *(float32x4_t *)v2708 = v1146; + vst1q_f32((float32_t *)v2483, v727); + vst1q_f32((float32_t *)v2501, v728); + vst1q_f32((float32_t *)v2546, v865); + vst1q_f32((float32_t *)v2564, v866); + vst1q_f32((float32_t *)v2627, v1007); + vst1q_f32((float32_t *)v2645, v1008); + vst1q_f32((float32_t *)v2690, v1145); + vst1q_f32((float32_t *)v2708, v1146); float32x4_t v867 = vsubq_f32(v378, v864); float32x4_t v868 = vaddq_f32(v378, v864); float32x4_t v1147 = vsubq_f32(v380, v1144); float32x4_t v1148 = vaddq_f32(v380, v1144); - *(float32x4_t *)v2555 = v867; - *(float32x4_t *)v2573 = v868; - *(float32x4_t *)v2699 = v1147; - *(float32x4_t *)v2717 = v1148; + vst1q_f32((float32_t *)v2555, v867); + vst1q_f32((float32_t *)v2573, v868); + vst1q_f32((float32_t *)v2699, v1147); + vst1q_f32((float32_t *)v2717, v1148); v5 += 2 * 1; v6 += 2 * 1; } diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_t_uu.c b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_t_uu.c index 72e53ca3e1fc0d78c88020e4edb1f49c98133148..2f58b4cfaf0cd876bbc4a7e4a438a1f35491c565 100644 --- a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_t_uu.c +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_t_uu.c @@ -25,20 +25,20 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu2(const armral_cmplx_f32_t *restrict x, float32x2_t *v182 = &v6[ostride]; 
const float32x2_t *v163 = &v5[0]; float32x2_t *v173 = &v6[0]; - float32x4_t v186 = *(const float32x4_t *)v153; + float32x4_t v186 = vld1q_f32((const float32_t *)v153); float32x4_t v42 = vtrn1q_f32(v186, v186); float32x4_t v43 = vtrn2q_f32(v186, v186); float32x4_t v47 = vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[0])); float32x4_t v49 = vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[1])); - float32x4_t v188 = *(const float32x4_t *)v163; + float32x4_t v188 = vld1q_f32((const float32_t *)v163); float32x4_t v48 = vmulq_f32(v42, v47); float32x4_t v51 = vfmaq_f32(v48, v43, v49); float32x4_t v59 = vaddq_f32(v188, v51); float32x4_t v60 = vsubq_f32(v188, v51); - *(float32x4_t *)v173 = v59; - *(float32x4_t *)v182 = v60; + vst1q_f32((float32_t *)v173, v59); + vst1q_f32((float32_t *)v182, v60); v5 += 2 * 1; v6 += 2 * 1; } @@ -129,7 +129,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu3(const armral_cmplx_f32_t *restrict x, float32x2_t v105 = (float32x2_t){v103, v104}; const float32x2_t *v265 = &v5[0]; float32x2_t *v275 = &v6[0]; - float32x4_t v297 = *(const float32x4_t *)v244; + float32x4_t v297 = vld1q_f32((const float32_t *)v244); float32x4_t v61 = vtrn1q_f32(v297, v297); float32x4_t v62 = vtrn2q_f32(v297, v297); float32x4_t v66 = @@ -144,10 +144,10 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu3(const armral_cmplx_f32_t *restrict x, float32x2_t v107 = vmul_f32(v106, v105); const float32x2_t *v253 = &v5[istride * 2]; float32x2_t *v293 = &v6[ostride * 2]; - float32x4_t v301 = *(const float32x4_t *)v265; + float32x4_t v301 = vld1q_f32((const float32_t *)v265); float32x4_t v67 = vmulq_f32(v61, v66); float32x4_t v109 = vcombine_f32(v107, v107); - float32x4_t v299 = *(const float32x4_t *)v253; + float32x4_t v299 = vld1q_f32((const float32_t *)v253); float32x4_t v70 = vfmaq_f32(v67, v62, v68); float32x4_t v73 = vtrn1q_f32(v299, v299); float32x4_t v74 = vtrn2q_f32(v299, v299); @@ -160,11 +160,11 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu3(const armral_cmplx_f32_t 
*restrict x, float32x4_t v108 = vrev64q_f32(v84); float32x4_t v110 = vmulq_f32(v108, v109); float32x4_t v111 = vaddq_f32(v92, v102); - *(float32x4_t *)v275 = v92; + vst1q_f32((float32_t *)v275, v92); float32x4_t v112 = vaddq_f32(v111, v110); float32x4_t v113 = vsubq_f32(v111, v110); - *(float32x4_t *)v284 = v113; - *(float32x4_t *)v293 = v112; + vst1q_f32((float32_t *)v284, v113); + vst1q_f32((float32_t *)v293, v112); v5 += 2 * 1; v6 += 2 * 1; } @@ -306,7 +306,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu4(const armral_cmplx_f32_t *restrict x, float32x2_t v144 = (float32x2_t){v142, v143}; const float32x2_t *v357 = &v5[0]; float32x2_t *v367 = &v6[0]; - float32x4_t v400 = *(const float32x4_t *)v336; + float32x4_t v400 = vld1q_f32((const float32_t *)v336); float32x4_t v47 = vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[2])); float32x4_t v49 = @@ -326,11 +326,11 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu4(const armral_cmplx_f32_t *restrict x, const float32x2_t *v345 = &v5[istride * 3]; float32x2_t *v385 = &v6[ostride * 2]; float32x2_t *v394 = &v6[ostride * 3]; - float32x4_t v404 = *(const float32x4_t *)v357; + float32x4_t v404 = vld1q_f32((const float32_t *)v357); float32x4_t v98 = vmulq_f32(v92, v97); float32x4_t v148 = vcombine_f32(v146, v146); - float32x4_t v398 = *(const float32x4_t *)v325; - float32x4_t v402 = *(const float32x4_t *)v345; + float32x4_t v398 = vld1q_f32((const float32_t *)v325); + float32x4_t v402 = vld1q_f32((const float32_t *)v345); float32x4_t v42 = vtrn1q_f32(v398, v398); float32x4_t v43 = vtrn2q_f32(v398, v398); float32x4_t v101 = vfmaq_f32(v98, v93, v99); @@ -348,12 +348,12 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu4(const armral_cmplx_f32_t *restrict x, float32x4_t v126 = vsubq_f32(v121, v123); float32x4_t v147 = vrev64q_f32(v124); float32x4_t v149 = vmulq_f32(v147, v148); - *(float32x4_t *)v367 = v125; - *(float32x4_t *)v385 = v126; + vst1q_f32((float32_t *)v367, v125); + vst1q_f32((float32_t *)v385, v126); float32x4_t v150 = 
vaddq_f32(v122, v149); float32x4_t v151 = vsubq_f32(v122, v149); - *(float32x4_t *)v376 = v151; - *(float32x4_t *)v394 = v150; + vst1q_f32((float32_t *)v376, v151); + vst1q_f32((float32_t *)v394, v150); v5 += 2 * 1; v6 += 2 * 1; } @@ -530,7 +530,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu5(const armral_cmplx_f32_t *restrict x, float32x2_t v193 = (float32x2_t){v191, v192}; const float32x2_t *v484 = &v5[0]; float32x2_t *v494 = &v6[0]; - float32x4_t v534 = *(const float32x4_t *)v441; + float32x4_t v534 = vld1q_f32((const float32_t *)v441); float32x4_t v61 = vtrn1q_f32(v534, v534); float32x4_t v62 = vtrn2q_f32(v534, v534); float32x4_t v66 = @@ -560,14 +560,14 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu5(const armral_cmplx_f32_t *restrict x, float32x2_t *v512 = &v6[ostride * 2]; float32x2_t *v521 = &v6[ostride * 3]; float32x2_t *v530 = &v6[ostride * 4]; - float32x4_t v542 = *(const float32x4_t *)v484; + float32x4_t v542 = vld1q_f32((const float32_t *)v484); float32x4_t v67 = vmulq_f32(v61, v66); float32x4_t v181 = vcombine_f32(v179, v179); float32x4_t v189 = vcombine_f32(v187, v187); float32x4_t v197 = vcombine_f32(v195, v195); - float32x4_t v536 = *(const float32x4_t *)v450; - float32x4_t v538 = *(const float32x4_t *)v461; - float32x4_t v540 = *(const float32x4_t *)v471; + float32x4_t v536 = vld1q_f32((const float32_t *)v450); + float32x4_t v538 = vld1q_f32((const float32_t *)v461); + float32x4_t v540 = vld1q_f32((const float32_t *)v471); float32x4_t v70 = vfmaq_f32(v67, v62, v68); float32x4_t v73 = vtrn1q_f32(v536, v536); float32x4_t v74 = vtrn2q_f32(v536, v536); @@ -598,7 +598,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu5(const armral_cmplx_f32_t *restrict x, float32x4_t v198 = vmulq_f32(v196, v197); float32x4_t v190 = vmulq_f32(v188, v189); float32x4_t v199 = vaddq_f32(v159, v169); - *(float32x4_t *)v494 = v159; + vst1q_f32((float32_t *)v494, v159); float32x4_t v200 = vaddq_f32(v199, v174); float32x4_t v201 = vsubq_f32(v199, v174); float32x4_t v202 = vsubq_f32(v182, 
v190); @@ -607,10 +607,10 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu5(const armral_cmplx_f32_t *restrict x, float32x4_t v205 = vsubq_f32(v200, v202); float32x4_t v206 = vaddq_f32(v201, v203); float32x4_t v207 = vsubq_f32(v201, v203); - *(float32x4_t *)v503 = v205; - *(float32x4_t *)v512 = v207; - *(float32x4_t *)v521 = v206; - *(float32x4_t *)v530 = v204; + vst1q_f32((float32_t *)v503, v205); + vst1q_f32((float32_t *)v512, v207); + vst1q_f32((float32_t *)v521, v206); + vst1q_f32((float32_t *)v530, v204); v5 += 2 * 1; v6 += 2 * 1; } @@ -856,7 +856,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu6(const armral_cmplx_f32_t *restrict x, float32x2_t v228 = (float32x2_t){v226, v227}; const float32x2_t *v559 = &v5[0]; float32x2_t *v569 = &v6[0]; - float32x4_t v626 = *(const float32x4_t *)v548; + float32x4_t v626 = vld1q_f32((const float32_t *)v548); float32x4_t v47 = vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[4])); float32x4_t v49 = @@ -889,13 +889,13 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu6(const armral_cmplx_f32_t *restrict x, float32x2_t *v587 = &v6[ostride * 4]; float32x2_t *v605 = &v6[ostride * 2]; float32x2_t *v614 = &v6[ostride * 5]; - float32x4_t v628 = *(const float32x4_t *)v559; + float32x4_t v628 = vld1q_f32((const float32_t *)v559); float32x4_t v172 = vmulq_f32(v166, v171); float32x4_t v232 = vcombine_f32(v230, v230); - float32x4_t v618 = *(const float32x4_t *)v505; - float32x4_t v620 = *(const float32x4_t *)v516; - float32x4_t v622 = *(const float32x4_t *)v526; - float32x4_t v624 = *(const float32x4_t *)v538; + float32x4_t v618 = vld1q_f32((const float32_t *)v505); + float32x4_t v620 = vld1q_f32((const float32_t *)v516); + float32x4_t v622 = vld1q_f32((const float32_t *)v526); + float32x4_t v624 = vld1q_f32((const float32_t *)v538); float32x4_t v42 = vtrn1q_f32(v618, v618); float32x4_t v43 = vtrn2q_f32(v618, v618); float32x4_t v92 = vtrn1q_f32(v620, v620); @@ -933,16 +933,16 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu6(const armral_cmplx_f32_t 
*restrict x, float32x4_t v210 = vaddq_f32(v191, v201); float32x4_t v233 = vmulq_f32(v231, v232); float32x4_t v234 = vaddq_f32(v215, v225); - *(float32x4_t *)v569 = v191; - *(float32x4_t *)v578 = v215; + vst1q_f32((float32_t *)v569, v191); + vst1q_f32((float32_t *)v578, v215); float32x4_t v211 = vaddq_f32(v210, v209); float32x4_t v212 = vsubq_f32(v210, v209); float32x4_t v235 = vaddq_f32(v234, v233); float32x4_t v236 = vsubq_f32(v234, v233); - *(float32x4_t *)v587 = v212; - *(float32x4_t *)v596 = v236; - *(float32x4_t *)v605 = v211; - *(float32x4_t *)v614 = v235; + vst1q_f32((float32_t *)v587, v212); + vst1q_f32((float32_t *)v596, v236); + vst1q_f32((float32_t *)v605, v211); + vst1q_f32((float32_t *)v614, v235); v5 += 2 * 1; v6 += 2 * 1; } @@ -1211,7 +1211,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu7(const armral_cmplx_f32_t *restrict x, float32x2_t v282 = (float32x2_t){v280, v281}; const float32x2_t *v713 = &v5[0]; float32x2_t *v723 = &v6[0]; - float32x4_t v781 = *(const float32x4_t *)v648; + float32x4_t v781 = vld1q_f32((const float32_t *)v648); float32x4_t v61 = vtrn1q_f32(v781, v781); float32x4_t v62 = vtrn2q_f32(v781, v781); float32x4_t v66 = @@ -1256,17 +1256,17 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu7(const armral_cmplx_f32_t *restrict x, float32x2_t *v759 = &v6[ostride * 4]; float32x2_t *v768 = &v6[ostride * 5]; float32x2_t *v777 = &v6[ostride * 6]; - float32x4_t v793 = *(const float32x4_t *)v713; + float32x4_t v793 = vld1q_f32((const float32_t *)v713); float32x4_t v67 = vmulq_f32(v61, v66); float32x4_t v262 = vcombine_f32(v260, v260); float32x4_t v270 = vcombine_f32(v268, v268); float32x4_t v278 = vcombine_f32(v276, v276); float32x4_t v286 = vcombine_f32(v284, v284); - float32x4_t v783 = *(const float32x4_t *)v657; - float32x4_t v785 = *(const float32x4_t *)v668; - float32x4_t v787 = *(const float32x4_t *)v678; - float32x4_t v789 = *(const float32x4_t *)v690; - float32x4_t v791 = *(const float32x4_t *)v700; + float32x4_t v783 = vld1q_f32((const float32_t 
*)v657); + float32x4_t v785 = vld1q_f32((const float32_t *)v668); + float32x4_t v787 = vld1q_f32((const float32_t *)v678); + float32x4_t v789 = vld1q_f32((const float32_t *)v690); + float32x4_t v791 = vld1q_f32((const float32_t *)v700); float32x4_t v70 = vfmaq_f32(v67, v62, v68); float32x4_t v73 = vtrn1q_f32(v783, v783); float32x4_t v74 = vtrn2q_f32(v783, v783); @@ -1318,7 +1318,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu7(const armral_cmplx_f32_t *restrict x, float32x4_t v287 = vmulq_f32(v285, v286); float32x4_t v263 = vmulq_f32(v261, v262); float32x4_t v288 = vaddq_f32(v222, v240); - *(float32x4_t *)v723 = v222; + vst1q_f32((float32_t *)v723, v222); float32x4_t v289 = vaddq_f32(v288, v245); float32x4_t v291 = vsubq_f32(v288, v245); float32x4_t v293 = vsubq_f32(v288, v250); @@ -1337,12 +1337,12 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu7(const armral_cmplx_f32_t *restrict x, float32x4_t v304 = vsubq_f32(v292, v298); float32x4_t v305 = vaddq_f32(v294, v300); float32x4_t v306 = vsubq_f32(v294, v300); - *(float32x4_t *)v732 = v302; - *(float32x4_t *)v741 = v304; - *(float32x4_t *)v750 = v305; - *(float32x4_t *)v759 = v306; - *(float32x4_t *)v768 = v303; - *(float32x4_t *)v777 = v301; + vst1q_f32((float32_t *)v732, v302); + vst1q_f32((float32_t *)v741, v304); + vst1q_f32((float32_t *)v750, v305); + vst1q_f32((float32_t *)v759, v306); + vst1q_f32((float32_t *)v768, v303); + vst1q_f32((float32_t *)v777, v301); v5 += 2 * 1; v6 += 2 * 1; } @@ -1715,7 +1715,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu8(const armral_cmplx_f32_t *restrict x, float32x2_t v307 = (float32x2_t){v306, v306}; const float32x2_t *v757 = &v5[0]; float32x2_t *v767 = &v6[0]; - float32x4_t v840 = *(const float32x4_t *)v714; + float32x4_t v840 = vld1q_f32((const float32_t *)v714); float32x4_t v47 = vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[6])); float32x4_t v49 = @@ -1761,16 +1761,16 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu8(const armral_cmplx_f32_t *restrict x, float32x2_t *v812 = 
&v6[ostride * 5]; float32x2_t *v821 = &v6[ostride * 6]; float32x2_t *v830 = &v6[ostride * 7]; - float32x4_t v848 = *(const float32x4_t *)v757; + float32x4_t v848 = vld1q_f32((const float32_t *)v757); float32x4_t v160 = vmulq_f32(v154, v159); float32x4_t v295 = vcombine_f32(v293, v293); float32x4_t v303 = vcombine_f32(v301, v301); - float32x4_t v834 = *(const float32x4_t *)v681; - float32x4_t v836 = *(const float32x4_t *)v692; - float32x4_t v838 = *(const float32x4_t *)v702; - float32x4_t v842 = *(const float32x4_t *)v723; - float32x4_t v844 = *(const float32x4_t *)v734; - float32x4_t v846 = *(const float32x4_t *)v744; + float32x4_t v834 = vld1q_f32((const float32_t *)v681); + float32x4_t v836 = vld1q_f32((const float32_t *)v692); + float32x4_t v838 = vld1q_f32((const float32_t *)v702); + float32x4_t v842 = vld1q_f32((const float32_t *)v723); + float32x4_t v844 = vld1q_f32((const float32_t *)v734); + float32x4_t v846 = vld1q_f32((const float32_t *)v744); float32x4_t v42 = vtrn1q_f32(v834, v834); float32x4_t v43 = vtrn2q_f32(v834, v834); float32x4_t v92 = vtrn1q_f32(v836, v836); @@ -1821,8 +1821,8 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu8(const armral_cmplx_f32_t *restrict x, float32x4_t v304 = vmulq_f32(v302, v303); float32x4_t v312 = vaddq_f32(v246, v309); float32x4_t v313 = vsubq_f32(v246, v309); - *(float32x4_t *)v767 = v257; - *(float32x4_t *)v803 = v258; + vst1q_f32((float32_t *)v767, v257); + vst1q_f32((float32_t *)v803, v258); float32x4_t v310 = vaddq_f32(v254, v283); float32x4_t v311 = vsubq_f32(v254, v283); float32x4_t v314 = vaddq_f32(v296, v304); @@ -1831,12 +1831,12 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu8(const armral_cmplx_f32_t *restrict x, float32x4_t v317 = vsubq_f32(v312, v314); float32x4_t v318 = vaddq_f32(v313, v315); float32x4_t v319 = vsubq_f32(v313, v315); - *(float32x4_t *)v785 = v311; - *(float32x4_t *)v821 = v310; - *(float32x4_t *)v776 = v317; - *(float32x4_t *)v794 = v318; - *(float32x4_t *)v812 = v319; - *(float32x4_t *)v830 = v316; + 
vst1q_f32((float32_t *)v785, v311); + vst1q_f32((float32_t *)v821, v310); + vst1q_f32((float32_t *)v776, v317); + vst1q_f32((float32_t *)v794, v318); + vst1q_f32((float32_t *)v812, v319); + vst1q_f32((float32_t *)v830, v316); v5 += 2 * 1; v6 += 2 * 1; } @@ -2187,7 +2187,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu9(const armral_cmplx_f32_t *restrict x, float32x2_t v360 = (float32x2_t){v358, v359}; const float32x2_t *v913 = &v5[0]; float32x2_t *v923 = &v6[0]; - float32x4_t v999 = *(const float32x4_t *)v826; + float32x4_t v999 = vld1q_f32((const float32_t *)v826); float32x4_t v61 = vtrn1q_f32(v999, v999); float32x4_t v62 = vtrn2q_f32(v999, v999); float32x4_t v66 = @@ -2245,19 +2245,19 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu9(const armral_cmplx_f32_t *restrict x, float32x2_t *v977 = &v6[ostride * 6]; float32x2_t *v986 = &v6[ostride * 7]; float32x2_t *v995 = &v6[ostride * 8]; - float32x4_t v1015 = *(const float32x4_t *)v913; + float32x4_t v1015 = vld1q_f32((const float32_t *)v913); float32x4_t v67 = vmulq_f32(v61, v66); float32x4_t v325 = vcombine_f32(v323, v323); float32x4_t v348 = vcombine_f32(v346, v346); float32x4_t v356 = vcombine_f32(v354, v354); float32x4_t v364 = vcombine_f32(v362, v362); - float32x4_t v1001 = *(const float32x4_t *)v835; - float32x4_t v1003 = *(const float32x4_t *)v846; - float32x4_t v1005 = *(const float32x4_t *)v856; - float32x4_t v1007 = *(const float32x4_t *)v868; - float32x4_t v1009 = *(const float32x4_t *)v878; - float32x4_t v1011 = *(const float32x4_t *)v890; - float32x4_t v1013 = *(const float32x4_t *)v900; + float32x4_t v1001 = vld1q_f32((const float32_t *)v835); + float32x4_t v1003 = vld1q_f32((const float32_t *)v846); + float32x4_t v1005 = vld1q_f32((const float32_t *)v856); + float32x4_t v1007 = vld1q_f32((const float32_t *)v868); + float32x4_t v1009 = vld1q_f32((const float32_t *)v878); + float32x4_t v1011 = vld1q_f32((const float32_t *)v890); + float32x4_t v1013 = vld1q_f32((const float32_t *)v900); float32x4_t v70 = 
vfmaq_f32(v67, v62, v68); float32x4_t v73 = vtrn1q_f32(v1001, v1001); float32x4_t v74 = vtrn2q_f32(v1001, v1001); @@ -2331,7 +2331,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu9(const armral_cmplx_f32_t *restrict x, float32x4_t v380 = vaddq_f32(v379, v357); float32x4_t v382 = vaddq_f32(v381, v365); float32x4_t v384 = vsubq_f32(v383, v365); - *(float32x4_t *)v923 = v287; + vst1q_f32((float32_t *)v923, v287); float32x4_t v368 = vaddq_f32(v287, v367); float32x4_t v372 = vaddq_f32(v371, v366); float32x4_t v369 = vaddq_f32(v368, v313); @@ -2342,20 +2342,20 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu9(const armral_cmplx_f32_t *restrict x, float32x4_t v374 = vaddq_f32(v373, v336); float32x4_t v376 = vaddq_f32(v375, v341); float32x4_t v378 = vsubq_f32(v377, v341); - *(float32x4_t *)v950 = v370; - *(float32x4_t *)v977 = v369; + vst1q_f32((float32_t *)v950, v370); + vst1q_f32((float32_t *)v977, v369); float32x4_t v385 = vaddq_f32(v374, v380); float32x4_t v386 = vsubq_f32(v374, v380); float32x4_t v387 = vaddq_f32(v376, v382); float32x4_t v388 = vsubq_f32(v376, v382); float32x4_t v389 = vaddq_f32(v378, v384); float32x4_t v390 = vsubq_f32(v378, v384); - *(float32x4_t *)v932 = v386; - *(float32x4_t *)v941 = v387; - *(float32x4_t *)v959 = v390; - *(float32x4_t *)v968 = v389; - *(float32x4_t *)v986 = v388; - *(float32x4_t *)v995 = v385; + vst1q_f32((float32_t *)v932, v386); + vst1q_f32((float32_t *)v941, v387); + vst1q_f32((float32_t *)v959, v390); + vst1q_f32((float32_t *)v968, v389); + vst1q_f32((float32_t *)v986, v388); + vst1q_f32((float32_t *)v995, v385); v5 += 2 * 1; v6 += 2 * 1; } @@ -2818,7 +2818,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu10(const armral_cmplx_f32_t *restrict x, float32x2_t v414 = (float32x2_t){v412, v413}; const float32x2_t *v1005 = &v5[0]; float32x2_t *v1015 = &v6[0]; - float32x4_t v1112 = *(const float32x4_t *)v972; + float32x4_t v1112 = vld1q_f32((const float32_t *)v972); float32x4_t v47 = vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[8])); 
float32x4_t v49 = @@ -2878,19 +2878,19 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu10(const armral_cmplx_f32_t *restrict x, float32x2_t *v1078 = &v6[ostride * 3]; float32x2_t *v1087 = &v6[ostride * 4]; float32x2_t *v1096 = &v6[ostride * 9]; - float32x4_t v1118 = *(const float32x4_t *)v1005; + float32x4_t v1118 = vld1q_f32((const float32_t *)v1005); float32x4_t v234 = vmulq_f32(v228, v233); float32x4_t v402 = vcombine_f32(v400, v400); float32x4_t v410 = vcombine_f32(v408, v408); float32x4_t v418 = vcombine_f32(v416, v416); - float32x4_t v1100 = *(const float32x4_t *)v907; - float32x4_t v1102 = *(const float32x4_t *)v918; - float32x4_t v1104 = *(const float32x4_t *)v928; - float32x4_t v1106 = *(const float32x4_t *)v940; - float32x4_t v1108 = *(const float32x4_t *)v950; - float32x4_t v1110 = *(const float32x4_t *)v962; - float32x4_t v1114 = *(const float32x4_t *)v982; - float32x4_t v1116 = *(const float32x4_t *)v992; + float32x4_t v1100 = vld1q_f32((const float32_t *)v907); + float32x4_t v1102 = vld1q_f32((const float32_t *)v918); + float32x4_t v1104 = vld1q_f32((const float32_t *)v928); + float32x4_t v1106 = vld1q_f32((const float32_t *)v940); + float32x4_t v1108 = vld1q_f32((const float32_t *)v950); + float32x4_t v1110 = vld1q_f32((const float32_t *)v962); + float32x4_t v1114 = vld1q_f32((const float32_t *)v982); + float32x4_t v1116 = vld1q_f32((const float32_t *)v992); float32x4_t v42 = vtrn1q_f32(v1100, v1100); float32x4_t v43 = vtrn2q_f32(v1100, v1100); float32x4_t v92 = vtrn1q_f32(v1102, v1102); @@ -2968,8 +2968,8 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu10(const armral_cmplx_f32_t *restrict x, float32x4_t v364 = vaddq_f32(v324, v334); float32x4_t v411 = vmulq_f32(v409, v410); float32x4_t v420 = vaddq_f32(v380, v390); - *(float32x4_t *)v1015 = v324; - *(float32x4_t *)v1024 = v380; + vst1q_f32((float32_t *)v1015, v324); + vst1q_f32((float32_t *)v1024, v380); float32x4_t v365 = vaddq_f32(v364, v339); float32x4_t v366 = vsubq_f32(v364, v339); float32x4_t v367 = 
vsubq_f32(v347, v355); @@ -2986,14 +2986,14 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu10(const armral_cmplx_f32_t *restrict x, float32x4_t v426 = vsubq_f32(v421, v423); float32x4_t v427 = vaddq_f32(v422, v424); float32x4_t v428 = vsubq_f32(v422, v424); - *(float32x4_t *)v1033 = v370; - *(float32x4_t *)v1042 = v426; - *(float32x4_t *)v1051 = v372; - *(float32x4_t *)v1060 = v428; - *(float32x4_t *)v1069 = v371; - *(float32x4_t *)v1078 = v427; - *(float32x4_t *)v1087 = v369; - *(float32x4_t *)v1096 = v425; + vst1q_f32((float32_t *)v1033, v370); + vst1q_f32((float32_t *)v1042, v426); + vst1q_f32((float32_t *)v1051, v372); + vst1q_f32((float32_t *)v1060, v428); + vst1q_f32((float32_t *)v1069, v371); + vst1q_f32((float32_t *)v1078, v427); + vst1q_f32((float32_t *)v1087, v369); + vst1q_f32((float32_t *)v1096, v425); v5 += 2 * 1; v6 += 2 * 1; } @@ -3490,7 +3490,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu11(const armral_cmplx_f32_t *restrict x, float32x2_t v504 = (float32x2_t){v502, v503}; const float32x2_t *v1269 = &v5[0]; float32x2_t *v1279 = &v6[0]; - float32x4_t v1373 = *(const float32x4_t *)v1160; + float32x4_t v1373 = vld1q_f32((const float32_t *)v1160); float32x4_t v213 = vtrn1q_f32(v1373, v1373); float32x4_t v214 = vtrn2q_f32(v1373, v1373); float32x4_t v218 = @@ -3571,7 +3571,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu11(const armral_cmplx_f32_t *restrict x, float32x2_t *v1342 = &v6[ostride * 4]; float32x2_t *v1351 = &v6[ostride * 3]; float32x2_t *v1360 = &v6[ostride * 2]; - float32x4_t v1393 = *(const float32x4_t *)v1269; + float32x4_t v1393 = vld1q_f32((const float32_t *)v1269); float32x4_t v219 = vmulq_f32(v213, v218); float32x4_t v391 = vcombine_f32(v389, v389); float32x4_t v444 = vcombine_f32(v442, v442); @@ -3583,15 +3583,15 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu11(const armral_cmplx_f32_t *restrict x, float32x4_t v492 = vcombine_f32(v490, v490); float32x4_t v500 = vcombine_f32(v498, v498); float32x4_t v508 = vcombine_f32(v506, v506); - float32x4_t v1375 = 
*(const float32x4_t *)v1169; - float32x4_t v1377 = *(const float32x4_t *)v1179; - float32x4_t v1379 = *(const float32x4_t *)v1189; - float32x4_t v1381 = *(const float32x4_t *)v1199; - float32x4_t v1383 = *(const float32x4_t *)v1209; - float32x4_t v1385 = *(const float32x4_t *)v1219; - float32x4_t v1387 = *(const float32x4_t *)v1229; - float32x4_t v1389 = *(const float32x4_t *)v1239; - float32x4_t v1391 = *(const float32x4_t *)v1249; + float32x4_t v1375 = vld1q_f32((const float32_t *)v1169); + float32x4_t v1377 = vld1q_f32((const float32_t *)v1179); + float32x4_t v1379 = vld1q_f32((const float32_t *)v1189); + float32x4_t v1381 = vld1q_f32((const float32_t *)v1199); + float32x4_t v1383 = vld1q_f32((const float32_t *)v1209); + float32x4_t v1385 = vld1q_f32((const float32_t *)v1219); + float32x4_t v1387 = vld1q_f32((const float32_t *)v1229); + float32x4_t v1389 = vld1q_f32((const float32_t *)v1239); + float32x4_t v1391 = vld1q_f32((const float32_t *)v1249); float32x4_t v222 = vfmaq_f32(v219, v214, v220); float32x4_t v225 = vtrn1q_f32(v1375, v1375); float32x4_t v226 = vtrn2q_f32(v1375, v1375); @@ -3713,7 +3713,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu11(const armral_cmplx_f32_t *restrict x, float32x4_t v524 = vsubq_f32(v493, v509); float32x4_t v525 = vaddq_f32(v515, v517); float32x4_t v543 = vaddq_f32(v519, v520); - *(float32x4_t *)v1279 = v354; + vst1q_f32((float32_t *)v1279, v354); float32x4_t v526 = vaddq_f32(v525, v510); float32x4_t v527 = vsubq_f32(v510, v512); float32x4_t v529 = vaddq_f32(v510, v516); @@ -3744,16 +3744,16 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu11(const armral_cmplx_f32_t *restrict x, float32x4_t v555 = vsubq_f32(v526, v536); float32x4_t v547 = vaddq_f32(v534, v546); float32x4_t v556 = vsubq_f32(v534, v546); - *(float32x4_t *)v1297 = v548; - *(float32x4_t *)v1306 = v549; - *(float32x4_t *)v1315 = v550; - *(float32x4_t *)v1324 = v551; - *(float32x4_t *)v1333 = v552; - *(float32x4_t *)v1342 = v553; - *(float32x4_t *)v1351 = v554; - *(float32x4_t 
*)v1360 = v555; - *(float32x4_t *)v1288 = v547; - *(float32x4_t *)v1369 = v556; + vst1q_f32((float32_t *)v1297, v548); + vst1q_f32((float32_t *)v1306, v549); + vst1q_f32((float32_t *)v1315, v550); + vst1q_f32((float32_t *)v1324, v551); + vst1q_f32((float32_t *)v1333, v552); + vst1q_f32((float32_t *)v1342, v553); + vst1q_f32((float32_t *)v1351, v554); + vst1q_f32((float32_t *)v1360, v555); + vst1q_f32((float32_t *)v1288, v547); + vst1q_f32((float32_t *)v1369, v556); v5 += 2 * 1; v6 += 2 * 1; } @@ -4464,7 +4464,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu12(const armral_cmplx_f32_t *restrict x, float32x2_t v439 = (float32x2_t){v438, v438}; const float32x2_t *v1096 = &v5[0]; float32x2_t *v1106 = &v6[0]; - float32x4_t v1225 = *(const float32x4_t *)v1065; + float32x4_t v1225 = vld1q_f32((const float32_t *)v1065); float32x4_t v66 = vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[6])); float32x4_t v68 = @@ -4536,21 +4536,21 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu12(const armral_cmplx_f32_t *restrict x, float32x2_t *v1187 = &v6[ostride * 3]; float32x2_t *v1196 = &v6[ostride * 7]; float32x2_t *v1205 = &v6[ostride * 11]; - float32x4_t v1231 = *(const float32x4_t *)v1096; + float32x4_t v1231 = vld1q_f32((const float32_t *)v1096); float32x4_t v291 = vmulq_f32(v285, v290); float32x4_t v372 = vcombine_f32(v370, v370); float32x4_t v403 = vcombine_f32(v401, v401); float32x4_t v435 = vcombine_f32(v433, v433); - float32x4_t v1209 = *(const float32x4_t *)v979; - float32x4_t v1211 = *(const float32x4_t *)v989; - float32x4_t v1213 = *(const float32x4_t *)v1001; - float32x4_t v1215 = *(const float32x4_t *)v1011; - float32x4_t v1217 = *(const float32x4_t *)v1023; - float32x4_t v1219 = *(const float32x4_t *)v1033; - float32x4_t v1221 = *(const float32x4_t *)v1043; - float32x4_t v1223 = *(const float32x4_t *)v1055; - float32x4_t v1227 = *(const float32x4_t *)v1074; - float32x4_t v1229 = *(const float32x4_t *)v1085; + float32x4_t v1209 = vld1q_f32((const float32_t *)v979); + 
float32x4_t v1211 = vld1q_f32((const float32_t *)v989); + float32x4_t v1213 = vld1q_f32((const float32_t *)v1001); + float32x4_t v1215 = vld1q_f32((const float32_t *)v1011); + float32x4_t v1217 = vld1q_f32((const float32_t *)v1023); + float32x4_t v1219 = vld1q_f32((const float32_t *)v1033); + float32x4_t v1221 = vld1q_f32((const float32_t *)v1043); + float32x4_t v1223 = vld1q_f32((const float32_t *)v1055); + float32x4_t v1227 = vld1q_f32((const float32_t *)v1074); + float32x4_t v1229 = vld1q_f32((const float32_t *)v1085); float32x4_t v61 = vtrn1q_f32(v1209, v1209); float32x4_t v62 = vtrn2q_f32(v1209, v1209); float32x4_t v73 = vtrn1q_f32(v1211, v1211); @@ -4642,8 +4642,8 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu12(const armral_cmplx_f32_t *restrict x, float32x4_t v443 = vsubq_f32(v436, v441); float32x4_t v444 = vaddq_f32(v349, v386); float32x4_t v492 = vaddq_f32(v350, v391); - *(float32x4_t *)v1106 = v349; - *(float32x4_t *)v1160 = v350; + vst1q_f32((float32_t *)v1106, v349); + vst1q_f32((float32_t *)v1160, v350); float32x4_t v374 = vaddq_f32(v346, v373); float32x4_t v375 = vsubq_f32(v346, v373); float32x4_t v445 = vaddq_f32(v444, v420); @@ -4652,20 +4652,20 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu12(const armral_cmplx_f32_t *restrict x, float32x4_t v494 = vsubq_f32(v492, v428); float32x4_t v468 = vaddq_f32(v375, v406); float32x4_t v516 = vaddq_f32(v374, v405); - *(float32x4_t *)v1115 = v446; - *(float32x4_t *)v1124 = v445; - *(float32x4_t *)v1133 = v375; - *(float32x4_t *)v1169 = v494; - *(float32x4_t *)v1178 = v493; - *(float32x4_t *)v1187 = v374; + vst1q_f32((float32_t *)v1115, v446); + vst1q_f32((float32_t *)v1124, v445); + vst1q_f32((float32_t *)v1133, v375); + vst1q_f32((float32_t *)v1169, v494); + vst1q_f32((float32_t *)v1178, v493); + vst1q_f32((float32_t *)v1187, v374); float32x4_t v469 = vaddq_f32(v468, v443); float32x4_t v470 = vsubq_f32(v468, v443); float32x4_t v517 = vaddq_f32(v516, v442); float32x4_t v518 = vsubq_f32(v516, v442); - *(float32x4_t 
*)v1142 = v470; - *(float32x4_t *)v1151 = v469; - *(float32x4_t *)v1196 = v518; - *(float32x4_t *)v1205 = v517; + vst1q_f32((float32_t *)v1142, v470); + vst1q_f32((float32_t *)v1151, v469); + vst1q_f32((float32_t *)v1196, v518); + vst1q_f32((float32_t *)v1205, v517); v5 += 2 * 1; v6 += 2 * 1; } @@ -5209,7 +5209,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu13(const armral_cmplx_f32_t *restrict x, float32x2_t v578 = (float32x2_t){v576, v577}; const float32x2_t *v1459 = &v5[0]; float32x2_t *v1469 = &v6[0]; - float32x4_t v1581 = *(const float32x4_t *)v1328; + float32x4_t v1581 = vld1q_f32((const float32_t *)v1328); float32x4_t v251 = vtrn1q_f32(v1581, v1581); float32x4_t v252 = vtrn2q_f32(v1581, v1581); float32x4_t v256 = @@ -5302,7 +5302,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu13(const armral_cmplx_f32_t *restrict x, float32x2_t *v1550 = &v6[ostride * 4]; float32x2_t *v1559 = &v6[ostride * 3]; float32x2_t *v1568 = &v6[ostride * 2]; - float32x4_t v1605 = *(const float32x4_t *)v1459; + float32x4_t v1605 = vld1q_f32((const float32_t *)v1459); float32x4_t v257 = vmulq_f32(v251, v256); float32x4_t v464 = vcombine_f32(v462, v462); float32x4_t v472 = vcombine_f32(v470, v470); @@ -5316,17 +5316,17 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu13(const armral_cmplx_f32_t *restrict x, float32x4_t v566 = vcombine_f32(v564, v564); float32x4_t v574 = vcombine_f32(v572, v572); float32x4_t v582 = vcombine_f32(v580, v580); - float32x4_t v1583 = *(const float32x4_t *)v1337; - float32x4_t v1585 = *(const float32x4_t *)v1347; - float32x4_t v1587 = *(const float32x4_t *)v1357; - float32x4_t v1589 = *(const float32x4_t *)v1367; - float32x4_t v1591 = *(const float32x4_t *)v1377; - float32x4_t v1593 = *(const float32x4_t *)v1387; - float32x4_t v1595 = *(const float32x4_t *)v1397; - float32x4_t v1597 = *(const float32x4_t *)v1407; - float32x4_t v1599 = *(const float32x4_t *)v1417; - float32x4_t v1601 = *(const float32x4_t *)v1427; - float32x4_t v1603 = *(const float32x4_t *)v1437; + float32x4_t 
v1583 = vld1q_f32((const float32_t *)v1337); + float32x4_t v1585 = vld1q_f32((const float32_t *)v1347); + float32x4_t v1587 = vld1q_f32((const float32_t *)v1357); + float32x4_t v1589 = vld1q_f32((const float32_t *)v1367); + float32x4_t v1591 = vld1q_f32((const float32_t *)v1377); + float32x4_t v1593 = vld1q_f32((const float32_t *)v1387); + float32x4_t v1595 = vld1q_f32((const float32_t *)v1397); + float32x4_t v1597 = vld1q_f32((const float32_t *)v1407); + float32x4_t v1599 = vld1q_f32((const float32_t *)v1417); + float32x4_t v1601 = vld1q_f32((const float32_t *)v1427); + float32x4_t v1603 = vld1q_f32((const float32_t *)v1437); float32x4_t v260 = vfmaq_f32(v257, v252, v258); float32x4_t v263 = vtrn1q_f32(v1583, v1583); float32x4_t v264 = vtrn2q_f32(v1583, v1583); @@ -5458,7 +5458,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu13(const armral_cmplx_f32_t *restrict x, float32x4_t v600 = vsubq_f32(v551, v559); float32x4_t v601 = vsubq_f32(v567, v583); float32x4_t v602 = vaddq_f32(v575, v583); - *(float32x4_t *)v1469 = v435; + vst1q_f32((float32_t *)v1469, v435); float32x4_t v588 = vaddq_f32(v587, v457); float32x4_t v590 = vsubq_f32(v589, v457); float32x4_t v591 = vaddq_f32(v584, v501); @@ -5499,18 +5499,18 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu13(const armral_cmplx_f32_t *restrict x, float32x4_t v632 = vaddq_f32(v607, v616); float32x4_t v633 = vsubq_f32(v606, v614); float32x4_t v634 = vaddq_f32(v605, v612); - *(float32x4_t *)v1478 = v623; - *(float32x4_t *)v1487 = v624; - *(float32x4_t *)v1496 = v625; - *(float32x4_t *)v1505 = v626; - *(float32x4_t *)v1514 = v627; - *(float32x4_t *)v1523 = v628; - *(float32x4_t *)v1532 = v629; - *(float32x4_t *)v1541 = v630; - *(float32x4_t *)v1550 = v631; - *(float32x4_t *)v1559 = v632; - *(float32x4_t *)v1568 = v633; - *(float32x4_t *)v1577 = v634; + vst1q_f32((float32_t *)v1478, v623); + vst1q_f32((float32_t *)v1487, v624); + vst1q_f32((float32_t *)v1496, v625); + vst1q_f32((float32_t *)v1505, v626); + vst1q_f32((float32_t *)v1514, 
v627); + vst1q_f32((float32_t *)v1523, v628); + vst1q_f32((float32_t *)v1532, v629); + vst1q_f32((float32_t *)v1541, v630); + vst1q_f32((float32_t *)v1550, v631); + vst1q_f32((float32_t *)v1559, v632); + vst1q_f32((float32_t *)v1568, v633); + vst1q_f32((float32_t *)v1577, v634); v5 += 2 * 1; v6 += 2 * 1; } @@ -6318,7 +6318,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu14(const armral_cmplx_f32_t *restrict x, float32x2_t v606 = (float32x2_t){v604, v605}; const float32x2_t *v1471 = &v5[0]; float32x2_t *v1481 = &v6[0]; - float32x4_t v1618 = *(const float32x4_t *)v1416; + float32x4_t v1618 = vld1q_f32((const float32_t *)v1416); float32x4_t v47 = vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[12])); float32x4_t v49 = @@ -6405,24 +6405,24 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu14(const armral_cmplx_f32_t *restrict x, float32x2_t *v1580 = &v6[ostride * 5]; float32x2_t *v1589 = &v6[ostride * 6]; float32x2_t *v1598 = &v6[ostride * 13]; - float32x4_t v1628 = *(const float32x4_t *)v1471; + float32x4_t v1628 = vld1q_f32((const float32_t *)v1471); float32x4_t v296 = vmulq_f32(v290, v295); float32x4_t v586 = vcombine_f32(v584, v584); float32x4_t v594 = vcombine_f32(v592, v592); float32x4_t v602 = vcombine_f32(v600, v600); float32x4_t v610 = vcombine_f32(v608, v608); - float32x4_t v1602 = *(const float32x4_t *)v1329; - float32x4_t v1604 = *(const float32x4_t *)v1340; - float32x4_t v1606 = *(const float32x4_t *)v1350; - float32x4_t v1608 = *(const float32x4_t *)v1362; - float32x4_t v1610 = *(const float32x4_t *)v1372; - float32x4_t v1612 = *(const float32x4_t *)v1384; - float32x4_t v1614 = *(const float32x4_t *)v1394; - float32x4_t v1616 = *(const float32x4_t *)v1406; - float32x4_t v1620 = *(const float32x4_t *)v1426; - float32x4_t v1622 = *(const float32x4_t *)v1436; - float32x4_t v1624 = *(const float32x4_t *)v1448; - float32x4_t v1626 = *(const float32x4_t *)v1458; + float32x4_t v1602 = vld1q_f32((const float32_t *)v1329); + float32x4_t v1604 = vld1q_f32((const 
float32_t *)v1340); + float32x4_t v1606 = vld1q_f32((const float32_t *)v1350); + float32x4_t v1608 = vld1q_f32((const float32_t *)v1362); + float32x4_t v1610 = vld1q_f32((const float32_t *)v1372); + float32x4_t v1612 = vld1q_f32((const float32_t *)v1384); + float32x4_t v1614 = vld1q_f32((const float32_t *)v1394); + float32x4_t v1616 = vld1q_f32((const float32_t *)v1406); + float32x4_t v1620 = vld1q_f32((const float32_t *)v1426); + float32x4_t v1622 = vld1q_f32((const float32_t *)v1436); + float32x4_t v1624 = vld1q_f32((const float32_t *)v1448); + float32x4_t v1626 = vld1q_f32((const float32_t *)v1458); float32x4_t v42 = vtrn1q_f32(v1602, v1602); float32x4_t v43 = vtrn2q_f32(v1602, v1602); float32x4_t v92 = vtrn1q_f32(v1604, v1604); @@ -6546,8 +6546,8 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu14(const armral_cmplx_f32_t *restrict x, float32x4_t v519 = vaddq_f32(v453, v471); float32x4_t v587 = vmulq_f32(v585, v586); float32x4_t v612 = vaddq_f32(v546, v564); - *(float32x4_t *)v1481 = v453; - *(float32x4_t *)v1490 = v546; + vst1q_f32((float32_t *)v1481, v453); + vst1q_f32((float32_t *)v1490, v546); float32x4_t v520 = vaddq_f32(v519, v476); float32x4_t v522 = vsubq_f32(v519, v476); float32x4_t v524 = vsubq_f32(v519, v481); @@ -6584,18 +6584,18 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu14(const armral_cmplx_f32_t *restrict x, float32x4_t v628 = vsubq_f32(v616, v622); float32x4_t v629 = vaddq_f32(v618, v624); float32x4_t v630 = vsubq_f32(v618, v624); - *(float32x4_t *)v1499 = v533; - *(float32x4_t *)v1508 = v626; - *(float32x4_t *)v1517 = v535; - *(float32x4_t *)v1526 = v628; - *(float32x4_t *)v1535 = v536; - *(float32x4_t *)v1544 = v629; - *(float32x4_t *)v1553 = v537; - *(float32x4_t *)v1562 = v630; - *(float32x4_t *)v1571 = v534; - *(float32x4_t *)v1580 = v627; - *(float32x4_t *)v1589 = v532; - *(float32x4_t *)v1598 = v625; + vst1q_f32((float32_t *)v1499, v533); + vst1q_f32((float32_t *)v1508, v626); + vst1q_f32((float32_t *)v1517, v535); + vst1q_f32((float32_t *)v1526, 
v628); + vst1q_f32((float32_t *)v1535, v536); + vst1q_f32((float32_t *)v1544, v629); + vst1q_f32((float32_t *)v1553, v537); + vst1q_f32((float32_t *)v1562, v630); + vst1q_f32((float32_t *)v1571, v534); + vst1q_f32((float32_t *)v1580, v627); + vst1q_f32((float32_t *)v1589, v532); + vst1q_f32((float32_t *)v1598, v625); v5 += 2 * 1; v6 += 2 * 1; } @@ -7332,7 +7332,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu15(const armral_cmplx_f32_t *restrict x, float32x2_t v585 = (float32x2_t){v584, v584}; const float32x2_t *v1454 = &v5[0]; float32x2_t *v1464 = &v6[0]; - float32x4_t v1606 = *(const float32x4_t *)v1369; + float32x4_t v1606 = vld1q_f32((const float32_t *)v1369); float32x4_t v66 = vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[8])); float32x4_t v68 = @@ -7434,7 +7434,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu15(const armral_cmplx_f32_t *restrict x, float32x2_t *v1572 = &v6[ostride * 9]; float32x2_t *v1581 = &v6[ostride * 4]; float32x2_t *v1590 = &v6[ostride * 14]; - float32x4_t v1622 = *(const float32x4_t *)v1454; + float32x4_t v1622 = vld1q_f32((const float32_t *)v1454); float32x4_t v222 = vmulq_f32(v216, v221); float32x4_t v458 = vcombine_f32(v456, v456); float32x4_t v466 = vcombine_f32(v464, v464); @@ -7445,19 +7445,19 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu15(const armral_cmplx_f32_t *restrict x, float32x4_t v555 = vcombine_f32(v553, v553); float32x4_t v563 = vcombine_f32(v561, v561); float32x4_t v571 = vcombine_f32(v569, v569); - float32x4_t v1594 = *(const float32x4_t *)v1305; - float32x4_t v1596 = *(const float32x4_t *)v1315; - float32x4_t v1598 = *(const float32x4_t *)v1327; - float32x4_t v1600 = *(const float32x4_t *)v1337; - float32x4_t v1602 = *(const float32x4_t *)v1349; - float32x4_t v1604 = *(const float32x4_t *)v1359; - float32x4_t v1608 = *(const float32x4_t *)v1379; - float32x4_t v1610 = *(const float32x4_t *)v1389; - float32x4_t v1612 = *(const float32x4_t *)v1399; - float32x4_t v1614 = *(const float32x4_t *)v1411; - float32x4_t v1616 = 
*(const float32x4_t *)v1421; - float32x4_t v1618 = *(const float32x4_t *)v1431; - float32x4_t v1620 = *(const float32x4_t *)v1443; + float32x4_t v1594 = vld1q_f32((const float32_t *)v1305); + float32x4_t v1596 = vld1q_f32((const float32_t *)v1315); + float32x4_t v1598 = vld1q_f32((const float32_t *)v1327); + float32x4_t v1600 = vld1q_f32((const float32_t *)v1337); + float32x4_t v1602 = vld1q_f32((const float32_t *)v1349); + float32x4_t v1604 = vld1q_f32((const float32_t *)v1359); + float32x4_t v1608 = vld1q_f32((const float32_t *)v1379); + float32x4_t v1610 = vld1q_f32((const float32_t *)v1389); + float32x4_t v1612 = vld1q_f32((const float32_t *)v1399); + float32x4_t v1614 = vld1q_f32((const float32_t *)v1411); + float32x4_t v1616 = vld1q_f32((const float32_t *)v1421); + float32x4_t v1618 = vld1q_f32((const float32_t *)v1431); + float32x4_t v1620 = vld1q_f32((const float32_t *)v1443); float32x4_t v61 = vtrn1q_f32(v1594, v1594); float32x4_t v62 = vtrn2q_f32(v1594, v1594); float32x4_t v73 = vtrn1q_f32(v1596, v1596); @@ -7583,7 +7583,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu15(const armral_cmplx_f32_t *restrict x, float32x4_t v536 = vaddq_f32(v523, v531); float32x4_t v556 = vmulq_f32(v554, v555); float32x4_t v597 = vaddq_f32(v436, v497); - *(float32x4_t *)v1464 = v436; + vst1q_f32((float32_t *)v1464, v436); float32x4_t v477 = vaddq_f32(v476, v451); float32x4_t v478 = vsubq_f32(v476, v451); float32x4_t v479 = vsubq_f32(v459, v467); @@ -7603,8 +7603,8 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu15(const armral_cmplx_f32_t *restrict x, float32x4_t v540 = vsubq_f32(v534, v536); float32x4_t v589 = vaddq_f32(v588, v572); float32x4_t v590 = vsubq_f32(v588, v572); - *(float32x4_t *)v1473 = v599; - *(float32x4_t *)v1482 = v598; + vst1q_f32((float32_t *)v1473, v599); + vst1q_f32((float32_t *)v1482, v598); float32x4_t v593 = vaddq_f32(v589, v591); float32x4_t v594 = vsubq_f32(v589, v591); float32x4_t v595 = vaddq_f32(v590, v592); @@ -7613,10 +7613,10 @@ void 
armral_fft_cf32_cf32_cf32_ac_t_uu15(const armral_cmplx_f32_t *restrict x, float32x4_t v645 = vaddq_f32(v484, v540); float32x4_t v669 = vaddq_f32(v483, v539); float32x4_t v693 = vaddq_f32(v481, v537); - *(float32x4_t *)v1491 = v482; - *(float32x4_t *)v1518 = v484; - *(float32x4_t *)v1545 = v483; - *(float32x4_t *)v1572 = v481; + vst1q_f32((float32_t *)v1491, v482); + vst1q_f32((float32_t *)v1518, v484); + vst1q_f32((float32_t *)v1545, v483); + vst1q_f32((float32_t *)v1572, v481); float32x4_t v622 = vaddq_f32(v621, v594); float32x4_t v623 = vsubq_f32(v621, v594); float32x4_t v646 = vaddq_f32(v645, v596); @@ -7625,14 +7625,14 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu15(const armral_cmplx_f32_t *restrict x, float32x4_t v671 = vsubq_f32(v669, v595); float32x4_t v694 = vaddq_f32(v693, v593); float32x4_t v695 = vsubq_f32(v693, v593); - *(float32x4_t *)v1500 = v623; - *(float32x4_t *)v1509 = v622; - *(float32x4_t *)v1527 = v647; - *(float32x4_t *)v1536 = v646; - *(float32x4_t *)v1554 = v671; - *(float32x4_t *)v1563 = v670; - *(float32x4_t *)v1581 = v695; - *(float32x4_t *)v1590 = v694; + vst1q_f32((float32_t *)v1500, v623); + vst1q_f32((float32_t *)v1509, v622); + vst1q_f32((float32_t *)v1527, v647); + vst1q_f32((float32_t *)v1536, v646); + vst1q_f32((float32_t *)v1554, v671); + vst1q_f32((float32_t *)v1563, v670); + vst1q_f32((float32_t *)v1581, v695); + vst1q_f32((float32_t *)v1590, v694); v5 += 2 * 1; v6 += 2 * 1; } @@ -8392,7 +8392,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu16(const armral_cmplx_f32_t *restrict x, float32x2_t v644 = (float32x2_t){v643, v643}; const float32x2_t *v1625 = &v5[0]; float32x2_t *v1635 = &v6[0]; - float32x4_t v1788 = *(const float32x4_t *)v1538; + float32x4_t v1788 = vld1q_f32((const float32_t *)v1538); float32x4_t v47 = vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[14])); float32x4_t v49 = @@ -8492,27 +8492,27 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu16(const armral_cmplx_f32_t *restrict x, float32x2_t *v1752 = &v6[ostride * 13]; 
float32x2_t *v1761 = &v6[ostride * 14]; float32x2_t *v1770 = &v6[ostride * 15]; - float32x4_t v1804 = *(const float32x4_t *)v1625; + float32x4_t v1804 = vld1q_f32((const float32_t *)v1625); float32x4_t v284 = vmulq_f32(v278, v283); float32x4_t v593 = vcombine_f32(v591, v591); float32x4_t v601 = vcombine_f32(v599, v599); float32x4_t v614 = vcombine_f32(v612, v612); float32x4_t v622 = vcombine_f32(v620, v620); float32x4_t v630 = vcombine_f32(v628, v628); - float32x4_t v1774 = *(const float32x4_t *)v1461; - float32x4_t v1776 = *(const float32x4_t *)v1472; - float32x4_t v1778 = *(const float32x4_t *)v1482; - float32x4_t v1780 = *(const float32x4_t *)v1494; - float32x4_t v1782 = *(const float32x4_t *)v1504; - float32x4_t v1784 = *(const float32x4_t *)v1516; - float32x4_t v1786 = *(const float32x4_t *)v1526; - float32x4_t v1790 = *(const float32x4_t *)v1547; - float32x4_t v1792 = *(const float32x4_t *)v1558; - float32x4_t v1794 = *(const float32x4_t *)v1568; - float32x4_t v1796 = *(const float32x4_t *)v1580; - float32x4_t v1798 = *(const float32x4_t *)v1590; - float32x4_t v1800 = *(const float32x4_t *)v1602; - float32x4_t v1802 = *(const float32x4_t *)v1612; + float32x4_t v1774 = vld1q_f32((const float32_t *)v1461); + float32x4_t v1776 = vld1q_f32((const float32_t *)v1472); + float32x4_t v1778 = vld1q_f32((const float32_t *)v1482); + float32x4_t v1780 = vld1q_f32((const float32_t *)v1494); + float32x4_t v1782 = vld1q_f32((const float32_t *)v1504); + float32x4_t v1784 = vld1q_f32((const float32_t *)v1516); + float32x4_t v1786 = vld1q_f32((const float32_t *)v1526); + float32x4_t v1790 = vld1q_f32((const float32_t *)v1547); + float32x4_t v1792 = vld1q_f32((const float32_t *)v1558); + float32x4_t v1794 = vld1q_f32((const float32_t *)v1568); + float32x4_t v1796 = vld1q_f32((const float32_t *)v1580); + float32x4_t v1798 = vld1q_f32((const float32_t *)v1590); + float32x4_t v1800 = vld1q_f32((const float32_t *)v1602); + float32x4_t v1802 = vld1q_f32((const float32_t *)v1612); 
float32x4_t v42 = vtrn1q_f32(v1774, v1774); float32x4_t v43 = vtrn2q_f32(v1774, v1774); float32x4_t v92 = vtrn1q_f32(v1776, v1776); @@ -8641,8 +8641,8 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu16(const armral_cmplx_f32_t *restrict x, float32x4_t v664 = vsubq_f32(v646, v636); float32x4_t v665 = vsubq_f32(v636, v641); float32x4_t v666 = vsubq_f32(v636, v646); - *(float32x4_t *)v1635 = v521; - *(float32x4_t *)v1707 = v522; + vst1q_f32((float32_t *)v1635, v521); + vst1q_f32((float32_t *)v1707, v522); float32x4_t v647 = vaddq_f32(v518, v555); float32x4_t v648 = vsubq_f32(v518, v555); float32x4_t v650 = vaddq_f32(v568, v576); @@ -8667,8 +8667,8 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu16(const armral_cmplx_f32_t *restrict x, float32x4_t v680 = vsubq_f32(v662, v664); float32x4_t v681 = vaddq_f32(v662, v660); float32x4_t v682 = vsubq_f32(v662, v660); - *(float32x4_t *)v1671 = v648; - *(float32x4_t *)v1743 = v647; + vst1q_f32((float32_t *)v1671, v648); + vst1q_f32((float32_t *)v1743, v647); float32x4_t v683 = vaddq_f32(v667, v677); float32x4_t v684 = vaddq_f32(v668, v678); float32x4_t v685 = vsubq_f32(v669, v678); @@ -8677,18 +8677,18 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu16(const armral_cmplx_f32_t *restrict x, float32x4_t v688 = vaddq_f32(v672, v680); float32x4_t v689 = vsubq_f32(v673, v682); float32x4_t v690 = vsubq_f32(v674, v681); - *(float32x4_t *)v1653 = v656; - *(float32x4_t *)v1689 = v655; - *(float32x4_t *)v1725 = v654; - *(float32x4_t *)v1761 = v653; - *(float32x4_t *)v1644 = v686; - *(float32x4_t *)v1662 = v689; - *(float32x4_t *)v1680 = v690; - *(float32x4_t *)v1698 = v685; - *(float32x4_t *)v1716 = v684; - *(float32x4_t *)v1734 = v687; - *(float32x4_t *)v1752 = v688; - *(float32x4_t *)v1770 = v683; + vst1q_f32((float32_t *)v1653, v656); + vst1q_f32((float32_t *)v1689, v655); + vst1q_f32((float32_t *)v1725, v654); + vst1q_f32((float32_t *)v1761, v653); + vst1q_f32((float32_t *)v1644, v686); + vst1q_f32((float32_t *)v1662, v689); + vst1q_f32((float32_t 
*)v1680, v690); + vst1q_f32((float32_t *)v1698, v685); + vst1q_f32((float32_t *)v1716, v684); + vst1q_f32((float32_t *)v1734, v687); + vst1q_f32((float32_t *)v1752, v688); + vst1q_f32((float32_t *)v1770, v683); v5 += 2 * 1; v6 += 2 * 1; } @@ -9516,7 +9516,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu17(const armral_cmplx_f32_t *restrict x, float32x2_t v828 = (float32x2_t){v826, v827}; const float32x2_t *v2090 = &v5[0]; float32x2_t *v2100 = &v6[0]; - float32x4_t v2248 = *(const float32x4_t *)v1915; + float32x4_t v2248 = vld1q_f32((const float32_t *)v1915); float32x4_t v61 = vtrn1q_f32(v2248, v2248); float32x4_t v62 = vtrn2q_f32(v2248, v2248); float32x4_t v66 = @@ -9648,7 +9648,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu17(const armral_cmplx_f32_t *restrict x, float32x2_t *v2226 = &v6[ostride * 10]; float32x2_t *v2235 = &v6[ostride * 8]; float32x2_t *v2244 = &v6[ostride * 9]; - float32x4_t v2280 = *(const float32x4_t *)v2090; + float32x4_t v2280 = vld1q_f32((const float32_t *)v2090); float32x4_t v67 = vmulq_f32(v61, v66); float32x4_t v672 = vcombine_f32(v670, v670); float32x4_t v680 = vcombine_f32(v678, v678); @@ -9671,21 +9671,21 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu17(const armral_cmplx_f32_t *restrict x, float32x4_t v816 = vcombine_f32(v814, v814); float32x4_t v824 = vcombine_f32(v822, v822); float32x4_t v832 = vcombine_f32(v830, v830); - float32x4_t v2250 = *(const float32x4_t *)v1924; - float32x4_t v2252 = *(const float32x4_t *)v1935; - float32x4_t v2254 = *(const float32x4_t *)v1945; - float32x4_t v2256 = *(const float32x4_t *)v1957; - float32x4_t v2258 = *(const float32x4_t *)v1967; - float32x4_t v2260 = *(const float32x4_t *)v1979; - float32x4_t v2262 = *(const float32x4_t *)v1989; - float32x4_t v2264 = *(const float32x4_t *)v2001; - float32x4_t v2266 = *(const float32x4_t *)v2011; - float32x4_t v2268 = *(const float32x4_t *)v2023; - float32x4_t v2270 = *(const float32x4_t *)v2033; - float32x4_t v2272 = *(const float32x4_t *)v2045; - float32x4_t v2274 = 
*(const float32x4_t *)v2055; - float32x4_t v2276 = *(const float32x4_t *)v2067; - float32x4_t v2278 = *(const float32x4_t *)v2077; + float32x4_t v2250 = vld1q_f32((const float32_t *)v1924); + float32x4_t v2252 = vld1q_f32((const float32_t *)v1935); + float32x4_t v2254 = vld1q_f32((const float32_t *)v1945); + float32x4_t v2256 = vld1q_f32((const float32_t *)v1957); + float32x4_t v2258 = vld1q_f32((const float32_t *)v1967); + float32x4_t v2260 = vld1q_f32((const float32_t *)v1979); + float32x4_t v2262 = vld1q_f32((const float32_t *)v1989); + float32x4_t v2264 = vld1q_f32((const float32_t *)v2001); + float32x4_t v2266 = vld1q_f32((const float32_t *)v2011); + float32x4_t v2268 = vld1q_f32((const float32_t *)v2023); + float32x4_t v2270 = vld1q_f32((const float32_t *)v2033); + float32x4_t v2272 = vld1q_f32((const float32_t *)v2045); + float32x4_t v2274 = vld1q_f32((const float32_t *)v2055); + float32x4_t v2276 = vld1q_f32((const float32_t *)v2067); + float32x4_t v2278 = vld1q_f32((const float32_t *)v2077); float32x4_t v70 = vfmaq_f32(v67, v62, v68); float32x4_t v73 = vtrn1q_f32(v2250, v2250); float32x4_t v74 = vtrn2q_f32(v2250, v2250); @@ -9871,7 +9871,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu17(const armral_cmplx_f32_t *restrict x, float32x4_t v840 = vsubq_f32(v665, v625); float32x4_t v841 = vaddq_f32(v665, v620); float32x4_t v842 = vaddq_f32(v630, v590); - *(float32x4_t *)v2100 = v590; + vst1q_f32((float32_t *)v2100, v590); float32x4_t v581 = vsubq_f32(v580, v532); float32x4_t v815 = vrev64q_f32(v577); float32x4_t v843 = vaddq_f32(v635, v842); @@ -9940,8 +9940,8 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu17(const armral_cmplx_f32_t *restrict x, float32x4_t v888 = vaddq_f32(v880, v886); float32x4_t v894 = vaddq_f32(v893, v886); float32x4_t v905 = vaddq_f32(v904, v886); - *(float32x4_t *)v2145 = v947; - *(float32x4_t *)v2154 = v955; + vst1q_f32((float32_t *)v2145, v947); + vst1q_f32((float32_t *)v2154, v955); float32x4_t v890 = vaddq_f32(v889, v881); float32x4_t v892 = 
vaddq_f32(v891, v884); float32x4_t v896 = vsubq_f32(v895, v888); @@ -9959,24 +9959,24 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu17(const armral_cmplx_f32_t *restrict x, float32x4_t v987 = vsubq_f32(v856, v896); float32x4_t v995 = vaddq_f32(v854, v892); float32x4_t v1003 = vsubq_f32(v854, v892); - *(float32x4_t *)v2127 = v931; - *(float32x4_t *)v2136 = v939; - *(float32x4_t *)v2235 = v1027; - *(float32x4_t *)v2244 = v1035; + vst1q_f32((float32_t *)v2127, v931); + vst1q_f32((float32_t *)v2136, v939); + vst1q_f32((float32_t *)v2235, v1027); + vst1q_f32((float32_t *)v2244, v1035); float32x4_t v963 = vaddq_f32(v857, v899); float32x4_t v971 = vsubq_f32(v857, v899); float32x4_t v1011 = vaddq_f32(v858, v902); float32x4_t v1019 = vsubq_f32(v858, v902); - *(float32x4_t *)v2109 = v915; - *(float32x4_t *)v2118 = v923; - *(float32x4_t *)v2181 = v979; - *(float32x4_t *)v2190 = v987; - *(float32x4_t *)v2199 = v995; - *(float32x4_t *)v2208 = v1003; - *(float32x4_t *)v2163 = v963; - *(float32x4_t *)v2172 = v971; - *(float32x4_t *)v2217 = v1011; - *(float32x4_t *)v2226 = v1019; + vst1q_f32((float32_t *)v2109, v915); + vst1q_f32((float32_t *)v2118, v923); + vst1q_f32((float32_t *)v2181, v979); + vst1q_f32((float32_t *)v2190, v987); + vst1q_f32((float32_t *)v2199, v995); + vst1q_f32((float32_t *)v2208, v1003); + vst1q_f32((float32_t *)v2163, v963); + vst1q_f32((float32_t *)v2172, v971); + vst1q_f32((float32_t *)v2217, v1011); + vst1q_f32((float32_t *)v2226, v1019); v5 += 2 * 1; v6 += 2 * 1; } @@ -11163,7 +11163,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu18(const armral_cmplx_f32_t *restrict x, float32x2_t v772 = (float32x2_t){v770, v771}; const float32x2_t *v1879 = &v5[0]; float32x2_t *v1889 = &v6[0]; - float32x4_t v2066 = *(const float32x4_t *)v1802; + float32x4_t v2066 = vld1q_f32((const float32_t *)v1802); float32x4_t v47 = vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[16])); float32x4_t v49 = @@ -11275,28 +11275,28 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu18(const 
armral_cmplx_f32_t *restrict x, float32x2_t *v2024 = &v6[ostride * 7]; float32x2_t *v2033 = &v6[ostride * 8]; float32x2_t *v2042 = &v6[ostride * 17]; - float32x4_t v2080 = *(const float32x4_t *)v1879; + float32x4_t v2080 = vld1q_f32((const float32_t *)v1879); float32x4_t v358 = vmulq_f32(v352, v357); float32x4_t v737 = vcombine_f32(v735, v735); float32x4_t v760 = vcombine_f32(v758, v758); float32x4_t v768 = vcombine_f32(v766, v766); float32x4_t v776 = vcombine_f32(v774, v774); - float32x4_t v2046 = *(const float32x4_t *)v1693; - float32x4_t v2048 = *(const float32x4_t *)v1704; - float32x4_t v2050 = *(const float32x4_t *)v1714; - float32x4_t v2052 = *(const float32x4_t *)v1726; - float32x4_t v2054 = *(const float32x4_t *)v1736; - float32x4_t v2056 = *(const float32x4_t *)v1748; - float32x4_t v2058 = *(const float32x4_t *)v1758; - float32x4_t v2060 = *(const float32x4_t *)v1770; - float32x4_t v2062 = *(const float32x4_t *)v1780; - float32x4_t v2064 = *(const float32x4_t *)v1792; - float32x4_t v2068 = *(const float32x4_t *)v1812; - float32x4_t v2070 = *(const float32x4_t *)v1822; - float32x4_t v2072 = *(const float32x4_t *)v1834; - float32x4_t v2074 = *(const float32x4_t *)v1844; - float32x4_t v2076 = *(const float32x4_t *)v1856; - float32x4_t v2078 = *(const float32x4_t *)v1866; + float32x4_t v2046 = vld1q_f32((const float32_t *)v1693); + float32x4_t v2048 = vld1q_f32((const float32_t *)v1704); + float32x4_t v2050 = vld1q_f32((const float32_t *)v1714); + float32x4_t v2052 = vld1q_f32((const float32_t *)v1726); + float32x4_t v2054 = vld1q_f32((const float32_t *)v1736); + float32x4_t v2056 = vld1q_f32((const float32_t *)v1748); + float32x4_t v2058 = vld1q_f32((const float32_t *)v1758); + float32x4_t v2060 = vld1q_f32((const float32_t *)v1770); + float32x4_t v2062 = vld1q_f32((const float32_t *)v1780); + float32x4_t v2064 = vld1q_f32((const float32_t *)v1792); + float32x4_t v2068 = vld1q_f32((const float32_t *)v1812); + float32x4_t v2070 = vld1q_f32((const float32_t 
*)v1822); + float32x4_t v2072 = vld1q_f32((const float32_t *)v1834); + float32x4_t v2074 = vld1q_f32((const float32_t *)v1844); + float32x4_t v2076 = vld1q_f32((const float32_t *)v1856); + float32x4_t v2078 = vld1q_f32((const float32_t *)v1866); float32x4_t v42 = vtrn1q_f32(v2046, v2046); float32x4_t v43 = vtrn2q_f32(v2046, v2046); float32x4_t v92 = vtrn1q_f32(v2048, v2048); @@ -11468,8 +11468,8 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu18(const armral_cmplx_f32_t *restrict x, float32x4_t v792 = vaddq_f32(v791, v769); float32x4_t v794 = vaddq_f32(v793, v777); float32x4_t v796 = vsubq_f32(v795, v777); - *(float32x4_t *)v1889 = v584; - *(float32x4_t *)v1898 = v699; + vst1q_f32((float32_t *)v1889, v584); + vst1q_f32((float32_t *)v1898, v699); float32x4_t v665 = vaddq_f32(v584, v664); float32x4_t v669 = vaddq_f32(v668, v663); float32x4_t v780 = vaddq_f32(v699, v779); @@ -11490,10 +11490,10 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu18(const armral_cmplx_f32_t *restrict x, float32x4_t v786 = vaddq_f32(v785, v748); float32x4_t v788 = vaddq_f32(v787, v753); float32x4_t v790 = vsubq_f32(v789, v753); - *(float32x4_t *)v1943 = v667; - *(float32x4_t *)v1952 = v782; - *(float32x4_t *)v1997 = v666; - *(float32x4_t *)v2006 = v781; + vst1q_f32((float32_t *)v1943, v667); + vst1q_f32((float32_t *)v1952, v782); + vst1q_f32((float32_t *)v1997, v666); + vst1q_f32((float32_t *)v2006, v781); float32x4_t v682 = vaddq_f32(v671, v677); float32x4_t v683 = vsubq_f32(v671, v677); float32x4_t v684 = vaddq_f32(v673, v679); @@ -11506,18 +11506,18 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu18(const armral_cmplx_f32_t *restrict x, float32x4_t v800 = vsubq_f32(v788, v794); float32x4_t v801 = vaddq_f32(v790, v796); float32x4_t v802 = vsubq_f32(v790, v796); - *(float32x4_t *)v1907 = v683; - *(float32x4_t *)v1916 = v798; - *(float32x4_t *)v1925 = v684; - *(float32x4_t *)v1934 = v799; - *(float32x4_t *)v1961 = v687; - *(float32x4_t *)v1970 = v802; - *(float32x4_t *)v1979 = v686; - *(float32x4_t *)v1988 = 
v801; - *(float32x4_t *)v2015 = v685; - *(float32x4_t *)v2024 = v800; - *(float32x4_t *)v2033 = v682; - *(float32x4_t *)v2042 = v797; + vst1q_f32((float32_t *)v1907, v683); + vst1q_f32((float32_t *)v1916, v798); + vst1q_f32((float32_t *)v1925, v684); + vst1q_f32((float32_t *)v1934, v799); + vst1q_f32((float32_t *)v1961, v687); + vst1q_f32((float32_t *)v1970, v802); + vst1q_f32((float32_t *)v1979, v686); + vst1q_f32((float32_t *)v1988, v801); + vst1q_f32((float32_t *)v2015, v685); + vst1q_f32((float32_t *)v2024, v800); + vst1q_f32((float32_t *)v2033, v682); + vst1q_f32((float32_t *)v2042, v797); v5 += 2 * 1; v6 += 2 * 1; } @@ -12482,7 +12482,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu19(const armral_cmplx_f32_t *restrict x, float32x2_t v915 = (float32x2_t){v913, v914}; const float32x2_t *v2321 = &v5[0]; float32x2_t *v2331 = &v6[0]; - float32x4_t v2497 = *(const float32x4_t *)v2124; + float32x4_t v2497 = vld1q_f32((const float32_t *)v2124); float32x4_t v61 = vtrn1q_f32(v2497, v2497); float32x4_t v62 = vtrn2q_f32(v2497, v2497); float32x4_t v66 = @@ -12629,7 +12629,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu19(const armral_cmplx_f32_t *restrict x, float32x2_t *v2475 = &v6[ostride * 11]; float32x2_t *v2484 = &v6[ostride * 9]; float32x2_t *v2493 = &v6[ostride * 10]; - float32x4_t v2533 = *(const float32x4_t *)v2321; + float32x4_t v2533 = vld1q_f32((const float32_t *)v2321); float32x4_t v67 = vmulq_f32(v61, v66); float32x4_t v775 = vcombine_f32(v773, v773); float32x4_t v783 = vcombine_f32(v781, v781); @@ -12650,23 +12650,23 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu19(const armral_cmplx_f32_t *restrict x, float32x4_t v903 = vcombine_f32(v901, v901); float32x4_t v911 = vcombine_f32(v909, v909); float32x4_t v919 = vcombine_f32(v917, v917); - float32x4_t v2499 = *(const float32x4_t *)v2133; - float32x4_t v2501 = *(const float32x4_t *)v2144; - float32x4_t v2503 = *(const float32x4_t *)v2154; - float32x4_t v2505 = *(const float32x4_t *)v2166; - float32x4_t v2507 = *(const 
float32x4_t *)v2176; - float32x4_t v2509 = *(const float32x4_t *)v2188; - float32x4_t v2511 = *(const float32x4_t *)v2198; - float32x4_t v2513 = *(const float32x4_t *)v2210; - float32x4_t v2515 = *(const float32x4_t *)v2220; - float32x4_t v2517 = *(const float32x4_t *)v2232; - float32x4_t v2519 = *(const float32x4_t *)v2242; - float32x4_t v2521 = *(const float32x4_t *)v2254; - float32x4_t v2523 = *(const float32x4_t *)v2264; - float32x4_t v2525 = *(const float32x4_t *)v2276; - float32x4_t v2527 = *(const float32x4_t *)v2286; - float32x4_t v2529 = *(const float32x4_t *)v2298; - float32x4_t v2531 = *(const float32x4_t *)v2308; + float32x4_t v2499 = vld1q_f32((const float32_t *)v2133); + float32x4_t v2501 = vld1q_f32((const float32_t *)v2144); + float32x4_t v2503 = vld1q_f32((const float32_t *)v2154); + float32x4_t v2505 = vld1q_f32((const float32_t *)v2166); + float32x4_t v2507 = vld1q_f32((const float32_t *)v2176); + float32x4_t v2509 = vld1q_f32((const float32_t *)v2188); + float32x4_t v2511 = vld1q_f32((const float32_t *)v2198); + float32x4_t v2513 = vld1q_f32((const float32_t *)v2210); + float32x4_t v2515 = vld1q_f32((const float32_t *)v2220); + float32x4_t v2517 = vld1q_f32((const float32_t *)v2232); + float32x4_t v2519 = vld1q_f32((const float32_t *)v2242); + float32x4_t v2521 = vld1q_f32((const float32_t *)v2254); + float32x4_t v2523 = vld1q_f32((const float32_t *)v2264); + float32x4_t v2525 = vld1q_f32((const float32_t *)v2276); + float32x4_t v2527 = vld1q_f32((const float32_t *)v2286); + float32x4_t v2529 = vld1q_f32((const float32_t *)v2298); + float32x4_t v2531 = vld1q_f32((const float32_t *)v2308); float32x4_t v70 = vfmaq_f32(v67, v62, v68); float32x4_t v73 = vtrn1q_f32(v2499, v2499); float32x4_t v74 = vtrn2q_f32(v2499, v2499); @@ -12884,7 +12884,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu19(const armral_cmplx_f32_t *restrict x, float32x4_t v947 = vaddq_f32(v678, v620); float32x4_t v952 = vaddq_f32(v784, v792); float32x4_t v953 = vaddq_f32(v808, v816); - 
*(float32x4_t *)v2331 = v620; + vst1q_f32((float32_t *)v2331, v620); float32x4_t v896 = vmulq_f32(v894, v895); float32x4_t v923 = vaddq_f32(v748, v753); float32x4_t v927 = vaddq_f32(v743, v753); @@ -12968,34 +12968,34 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu19(const armral_cmplx_f32_t *restrict x, float32x4_t v1085 = vsubq_f32(v987, v999); float32x4_t v1125 = vsubq_f32(v991, v1003); float32x4_t v1133 = vaddq_f32(v991, v1003); - *(float32x4_t *)v2358 = v1029; - *(float32x4_t *)v2367 = v1037; - *(float32x4_t *)v2376 = v1045; - *(float32x4_t *)v2385 = v1053; + vst1q_f32((float32_t *)v2358, v1029); + vst1q_f32((float32_t *)v2367, v1037); + vst1q_f32((float32_t *)v2376, v1045); + vst1q_f32((float32_t *)v2385, v1053); float32x4_t v995 = vaddq_f32(v994, v978); float32x4_t v997 = vaddq_f32(v996, v980); float32x4_t v1093 = vaddq_f32(v989, v1001); float32x4_t v1101 = vsubq_f32(v989, v1001); float32x4_t v1109 = vaddq_f32(v988, v1000); float32x4_t v1117 = vsubq_f32(v988, v1000); - *(float32x4_t *)v2394 = v1061; - *(float32x4_t *)v2403 = v1069; - *(float32x4_t *)v2412 = v1077; - *(float32x4_t *)v2421 = v1085; - *(float32x4_t *)v2466 = v1125; - *(float32x4_t *)v2475 = v1133; + vst1q_f32((float32_t *)v2394, v1061); + vst1q_f32((float32_t *)v2403, v1069); + vst1q_f32((float32_t *)v2412, v1077); + vst1q_f32((float32_t *)v2421, v1085); + vst1q_f32((float32_t *)v2466, v1125); + vst1q_f32((float32_t *)v2475, v1133); float32x4_t v1013 = vaddq_f32(v983, v995); float32x4_t v1021 = vsubq_f32(v983, v995); float32x4_t v1141 = vaddq_f32(v985, v997); float32x4_t v1149 = vsubq_f32(v985, v997); - *(float32x4_t *)v2430 = v1093; - *(float32x4_t *)v2439 = v1101; - *(float32x4_t *)v2448 = v1109; - *(float32x4_t *)v2457 = v1117; - *(float32x4_t *)v2340 = v1013; - *(float32x4_t *)v2349 = v1021; - *(float32x4_t *)v2484 = v1141; - *(float32x4_t *)v2493 = v1149; + vst1q_f32((float32_t *)v2430, v1093); + vst1q_f32((float32_t *)v2439, v1101); + vst1q_f32((float32_t *)v2448, v1109); + vst1q_f32((float32_t 
*)v2457, v1117); + vst1q_f32((float32_t *)v2340, v1013); + vst1q_f32((float32_t *)v2349, v1021); + vst1q_f32((float32_t *)v2484, v1141); + vst1q_f32((float32_t *)v2493, v1149); v5 += 2 * 1; v6 += 2 * 1; } @@ -14327,7 +14327,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu20(const armral_cmplx_f32_t *restrict x, float32x2_t v859 = (float32x2_t){v858, v858}; const float32x2_t *v2067 = &v5[0]; float32x2_t *v2077 = &v6[0]; - float32x4_t v2286 = *(const float32x4_t *)v2046; + float32x4_t v2286 = vld1q_f32((const float32_t *)v2046); float32x4_t v47 = vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[18])); float32x4_t v49 = @@ -14453,7 +14453,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu20(const armral_cmplx_f32_t *restrict x, float32x2_t *v2230 = &v6[ostride * 9]; float32x2_t *v2239 = &v6[ostride * 14]; float32x2_t *v2248 = &v6[ostride * 19]; - float32x4_t v2290 = *(const float32x4_t *)v2067; + float32x4_t v2290 = vld1q_f32((const float32_t *)v2067); float32x4_t v594 = vmulq_f32(v588, v593); float32x4_t v788 = vcombine_f32(v786, v786); float32x4_t v796 = vcombine_f32(v794, v794); @@ -14461,24 +14461,24 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu20(const armral_cmplx_f32_t *restrict x, float32x4_t v829 = vcombine_f32(v827, v827); float32x4_t v837 = vcombine_f32(v835, v835); float32x4_t v845 = vcombine_f32(v843, v843); - float32x4_t v2252 = *(const float32x4_t *)v1859; - float32x4_t v2254 = *(const float32x4_t *)v1870; - float32x4_t v2256 = *(const float32x4_t *)v1880; - float32x4_t v2258 = *(const float32x4_t *)v1892; - float32x4_t v2260 = *(const float32x4_t *)v1902; - float32x4_t v2262 = *(const float32x4_t *)v1914; - float32x4_t v2264 = *(const float32x4_t *)v1924; - float32x4_t v2266 = *(const float32x4_t *)v1936; - float32x4_t v2268 = *(const float32x4_t *)v1946; - float32x4_t v2270 = *(const float32x4_t *)v1958; - float32x4_t v2272 = *(const float32x4_t *)v1968; - float32x4_t v2274 = *(const float32x4_t *)v1980; - float32x4_t v2276 = *(const float32x4_t *)v1990; - 
float32x4_t v2278 = *(const float32x4_t *)v2002; - float32x4_t v2280 = *(const float32x4_t *)v2012; - float32x4_t v2282 = *(const float32x4_t *)v2024; - float32x4_t v2284 = *(const float32x4_t *)v2034; - float32x4_t v2288 = *(const float32x4_t *)v2055; + float32x4_t v2252 = vld1q_f32((const float32_t *)v1859); + float32x4_t v2254 = vld1q_f32((const float32_t *)v1870); + float32x4_t v2256 = vld1q_f32((const float32_t *)v1880); + float32x4_t v2258 = vld1q_f32((const float32_t *)v1892); + float32x4_t v2260 = vld1q_f32((const float32_t *)v1902); + float32x4_t v2262 = vld1q_f32((const float32_t *)v1914); + float32x4_t v2264 = vld1q_f32((const float32_t *)v1924); + float32x4_t v2266 = vld1q_f32((const float32_t *)v1936); + float32x4_t v2268 = vld1q_f32((const float32_t *)v1946); + float32x4_t v2270 = vld1q_f32((const float32_t *)v1958); + float32x4_t v2272 = vld1q_f32((const float32_t *)v1968); + float32x4_t v2274 = vld1q_f32((const float32_t *)v1980); + float32x4_t v2276 = vld1q_f32((const float32_t *)v1990); + float32x4_t v2278 = vld1q_f32((const float32_t *)v2002); + float32x4_t v2280 = vld1q_f32((const float32_t *)v2012); + float32x4_t v2282 = vld1q_f32((const float32_t *)v2024); + float32x4_t v2284 = vld1q_f32((const float32_t *)v2034); + float32x4_t v2288 = vld1q_f32((const float32_t *)v2055); float32x4_t v42 = vtrn1q_f32(v2252, v2252); float32x4_t v43 = vtrn2q_f32(v2252, v2252); float32x4_t v92 = vtrn1q_f32(v2254, v2254); @@ -14656,8 +14656,8 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu20(const armral_cmplx_f32_t *restrict x, float32x4_t v809 = vsubq_f32(v789, v797); float32x4_t v810 = vaddq_f32(v797, v805); float32x4_t v830 = vmulq_f32(v828, v829); - *(float32x4_t *)v2077 = v654; - *(float32x4_t *)v2095 = v710; + vst1q_f32((float32_t *)v2077, v654); + vst1q_f32((float32_t *)v2095, v710); float32x4_t v695 = vaddq_f32(v694, v669); float32x4_t v696 = vsubq_f32(v694, v669); float32x4_t v697 = vsubq_f32(v677, v685); @@ -14683,20 +14683,20 @@ void 
armral_fft_cf32_cf32_cf32_ac_t_uu20(const armral_cmplx_f32_t *restrict x, float32x4_t v758 = vsubq_f32(v752, v754); float32x4_t v863 = vaddq_f32(v862, v846); float32x4_t v864 = vsubq_f32(v862, v846); - *(float32x4_t *)v2086 = v872; - *(float32x4_t *)v2104 = v871; + vst1q_f32((float32_t *)v2086, v872); + vst1q_f32((float32_t *)v2104, v871); float32x4_t v867 = vaddq_f32(v863, v865); float32x4_t v868 = vsubq_f32(v863, v865); float32x4_t v869 = vaddq_f32(v864, v866); float32x4_t v870 = vsubq_f32(v864, v866); - *(float32x4_t *)v2113 = v700; - *(float32x4_t *)v2131 = v756; - *(float32x4_t *)v2149 = v702; - *(float32x4_t *)v2167 = v758; - *(float32x4_t *)v2185 = v701; - *(float32x4_t *)v2203 = v757; - *(float32x4_t *)v2221 = v699; - *(float32x4_t *)v2239 = v755; + vst1q_f32((float32_t *)v2113, v700); + vst1q_f32((float32_t *)v2131, v756); + vst1q_f32((float32_t *)v2149, v702); + vst1q_f32((float32_t *)v2167, v758); + vst1q_f32((float32_t *)v2185, v701); + vst1q_f32((float32_t *)v2203, v757); + vst1q_f32((float32_t *)v2221, v699); + vst1q_f32((float32_t *)v2239, v755); float32x4_t v901 = vaddq_f32(v812, v868); float32x4_t v902 = vsubq_f32(v812, v868); float32x4_t v931 = vaddq_f32(v814, v870); @@ -14705,14 +14705,14 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu20(const armral_cmplx_f32_t *restrict x, float32x4_t v962 = vsubq_f32(v813, v869); float32x4_t v991 = vaddq_f32(v811, v867); float32x4_t v992 = vsubq_f32(v811, v867); - *(float32x4_t *)v2122 = v902; - *(float32x4_t *)v2140 = v901; - *(float32x4_t *)v2158 = v932; - *(float32x4_t *)v2176 = v931; - *(float32x4_t *)v2194 = v962; - *(float32x4_t *)v2212 = v961; - *(float32x4_t *)v2230 = v992; - *(float32x4_t *)v2248 = v991; + vst1q_f32((float32_t *)v2122, v902); + vst1q_f32((float32_t *)v2140, v901); + vst1q_f32((float32_t *)v2158, v932); + vst1q_f32((float32_t *)v2176, v931); + vst1q_f32((float32_t *)v2194, v962); + vst1q_f32((float32_t *)v2212, v961); + vst1q_f32((float32_t *)v2230, v992); + vst1q_f32((float32_t *)v2248, 
v991); v5 += 2 * 1; v6 += 2 * 1; } @@ -15697,7 +15697,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu21(const armral_cmplx_f32_t *restrict x, float32x2_t v857 = (float32x2_t){v856, v856}; const float32x2_t *v2125 = &v5[0]; float32x2_t *v2135 = &v6[0]; - float32x4_t v2347 = *(const float32x4_t *)v2062; + float32x4_t v2347 = vld1q_f32((const float32_t *)v2062); float32x4_t v66 = vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[12])); float32x4_t v68 = @@ -15844,7 +15844,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu21(const armral_cmplx_f32_t *restrict x, float32x2_t *v2297 = &v6[ostride * 6]; float32x2_t *v2306 = &v6[ostride * 13]; float32x2_t *v2315 = &v6[ostride * 20]; - float32x4_t v2359 = *(const float32x4_t *)v2125; + float32x4_t v2359 = vld1q_f32((const float32_t *)v2125); float32x4_t v453 = vmulq_f32(v447, v452); float32x4_t v645 = vcombine_f32(v643, v643); float32x4_t v653 = vcombine_f32(v651, v651); @@ -15859,25 +15859,25 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu21(const armral_cmplx_f32_t *restrict x, float32x4_t v822 = vcombine_f32(v820, v820); float32x4_t v830 = vcombine_f32(v828, v828); float32x4_t v838 = vcombine_f32(v836, v836); - float32x4_t v2319 = *(const float32x4_t *)v1912; - float32x4_t v2321 = *(const float32x4_t *)v1922; - float32x4_t v2323 = *(const float32x4_t *)v1934; - float32x4_t v2325 = *(const float32x4_t *)v1944; - float32x4_t v2327 = *(const float32x4_t *)v1956; - float32x4_t v2329 = *(const float32x4_t *)v1966; - float32x4_t v2331 = *(const float32x4_t *)v1976; - float32x4_t v2333 = *(const float32x4_t *)v1988; - float32x4_t v2335 = *(const float32x4_t *)v1998; - float32x4_t v2337 = *(const float32x4_t *)v2008; - float32x4_t v2339 = *(const float32x4_t *)v2020; - float32x4_t v2341 = *(const float32x4_t *)v2030; - float32x4_t v2343 = *(const float32x4_t *)v2040; - float32x4_t v2345 = *(const float32x4_t *)v2052; - float32x4_t v2349 = *(const float32x4_t *)v2071; - float32x4_t v2351 = *(const float32x4_t *)v2082; - float32x4_t v2353 
= *(const float32x4_t *)v2092; - float32x4_t v2355 = *(const float32x4_t *)v2102; - float32x4_t v2357 = *(const float32x4_t *)v2114; + float32x4_t v2319 = vld1q_f32((const float32_t *)v1912); + float32x4_t v2321 = vld1q_f32((const float32_t *)v1922); + float32x4_t v2323 = vld1q_f32((const float32_t *)v1934); + float32x4_t v2325 = vld1q_f32((const float32_t *)v1944); + float32x4_t v2327 = vld1q_f32((const float32_t *)v1956); + float32x4_t v2329 = vld1q_f32((const float32_t *)v1966); + float32x4_t v2331 = vld1q_f32((const float32_t *)v1976); + float32x4_t v2333 = vld1q_f32((const float32_t *)v1988); + float32x4_t v2335 = vld1q_f32((const float32_t *)v1998); + float32x4_t v2337 = vld1q_f32((const float32_t *)v2008); + float32x4_t v2339 = vld1q_f32((const float32_t *)v2020); + float32x4_t v2341 = vld1q_f32((const float32_t *)v2030); + float32x4_t v2343 = vld1q_f32((const float32_t *)v2040); + float32x4_t v2345 = vld1q_f32((const float32_t *)v2052); + float32x4_t v2349 = vld1q_f32((const float32_t *)v2071); + float32x4_t v2351 = vld1q_f32((const float32_t *)v2082); + float32x4_t v2353 = vld1q_f32((const float32_t *)v2092); + float32x4_t v2355 = vld1q_f32((const float32_t *)v2102); + float32x4_t v2357 = vld1q_f32((const float32_t *)v2114); float32x4_t v61 = vtrn1q_f32(v2319, v2319); float32x4_t v62 = vtrn2q_f32(v2319, v2319); float32x4_t v73 = vtrn1q_f32(v2321, v2321); @@ -16078,7 +16078,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu21(const armral_cmplx_f32_t *restrict x, float32x4_t v870 = vsubq_f32(v869, v859); float32x4_t v872 = vaddq_f32(v871, v859); float32x4_t v879 = vaddq_f32(v605, v711); - *(float32x4_t *)v2135 = v605; + vst1q_f32((float32_t *)v2135, v605); float32x4_t v672 = vaddq_f32(v671, v628); float32x4_t v674 = vsubq_f32(v671, v628); float32x4_t v676 = vsubq_f32(v671, v633); @@ -16106,8 +16106,8 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu21(const armral_cmplx_f32_t *restrict x, float32x4_t v861 = vaddq_f32(v860, v823); float32x4_t v863 = vsubq_f32(v860, v823); 
float32x4_t v865 = vsubq_f32(v860, v831); - *(float32x4_t *)v2144 = v881; - *(float32x4_t *)v2153 = v880; + vst1q_f32((float32_t *)v2144, v881); + vst1q_f32((float32_t *)v2153, v880); float32x4_t v684 = vaddq_f32(v673, v679); float32x4_t v685 = vsubq_f32(v673, v679); float32x4_t v686 = vaddq_f32(v675, v681); @@ -16135,12 +16135,12 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu21(const armral_cmplx_f32_t *restrict x, float32x4_t v975 = vaddq_f32(v689, v782); float32x4_t v999 = vaddq_f32(v686, v779); float32x4_t v1023 = vaddq_f32(v684, v777); - *(float32x4_t *)v2162 = v685; - *(float32x4_t *)v2189 = v687; - *(float32x4_t *)v2216 = v688; - *(float32x4_t *)v2243 = v689; - *(float32x4_t *)v2270 = v686; - *(float32x4_t *)v2297 = v684; + vst1q_f32((float32_t *)v2162, v685); + vst1q_f32((float32_t *)v2189, v687); + vst1q_f32((float32_t *)v2216, v688); + vst1q_f32((float32_t *)v2243, v689); + vst1q_f32((float32_t *)v2270, v686); + vst1q_f32((float32_t *)v2297, v684); float32x4_t v904 = vaddq_f32(v903, v874); float32x4_t v905 = vsubq_f32(v903, v874); float32x4_t v928 = vaddq_f32(v927, v876); @@ -16153,18 +16153,18 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu21(const armral_cmplx_f32_t *restrict x, float32x4_t v1001 = vsubq_f32(v999, v875); float32x4_t v1024 = vaddq_f32(v1023, v873); float32x4_t v1025 = vsubq_f32(v1023, v873); - *(float32x4_t *)v2171 = v905; - *(float32x4_t *)v2180 = v904; - *(float32x4_t *)v2198 = v929; - *(float32x4_t *)v2207 = v928; - *(float32x4_t *)v2225 = v953; - *(float32x4_t *)v2234 = v952; - *(float32x4_t *)v2252 = v977; - *(float32x4_t *)v2261 = v976; - *(float32x4_t *)v2279 = v1001; - *(float32x4_t *)v2288 = v1000; - *(float32x4_t *)v2306 = v1025; - *(float32x4_t *)v2315 = v1024; + vst1q_f32((float32_t *)v2171, v905); + vst1q_f32((float32_t *)v2180, v904); + vst1q_f32((float32_t *)v2198, v929); + vst1q_f32((float32_t *)v2207, v928); + vst1q_f32((float32_t *)v2225, v953); + vst1q_f32((float32_t *)v2234, v952); + vst1q_f32((float32_t *)v2252, v977); + 
vst1q_f32((float32_t *)v2261, v976); + vst1q_f32((float32_t *)v2279, v1001); + vst1q_f32((float32_t *)v2288, v1000); + vst1q_f32((float32_t *)v2306, v1025); + vst1q_f32((float32_t *)v2315, v1024); v5 += 2 * 1; v6 += 2 * 1; } @@ -17361,7 +17361,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu22(const armral_cmplx_f32_t *restrict x, float32x2_t v1086 = (float32x2_t){v1084, v1085}; const float32x2_t *v2599 = &v5[0]; float32x2_t *v2609 = &v6[0]; - float32x4_t v2826 = *(const float32x4_t *)v2500; + float32x4_t v2826 = vld1q_f32((const float32_t *)v2500); float32x4_t v47 = vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[20])); float32x4_t v49 = @@ -17508,7 +17508,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu22(const armral_cmplx_f32_t *restrict x, float32x2_t *v2780 = &v6[ostride * 9]; float32x2_t *v2789 = &v6[ostride * 10]; float32x2_t *v2798 = &v6[ostride * 21]; - float32x4_t v2844 = *(const float32x4_t *)v2599; + float32x4_t v2844 = vld1q_f32((const float32_t *)v2599); float32x4_t v420 = vmulq_f32(v414, v419); float32x4_t v973 = vcombine_f32(v971, v971); float32x4_t v1026 = vcombine_f32(v1024, v1024); @@ -17520,26 +17520,26 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu22(const armral_cmplx_f32_t *restrict x, float32x4_t v1074 = vcombine_f32(v1072, v1072); float32x4_t v1082 = vcombine_f32(v1080, v1080); float32x4_t v1090 = vcombine_f32(v1088, v1088); - float32x4_t v2802 = *(const float32x4_t *)v2369; - float32x4_t v2804 = *(const float32x4_t *)v2380; - float32x4_t v2806 = *(const float32x4_t *)v2390; - float32x4_t v2808 = *(const float32x4_t *)v2402; - float32x4_t v2810 = *(const float32x4_t *)v2412; - float32x4_t v2812 = *(const float32x4_t *)v2424; - float32x4_t v2814 = *(const float32x4_t *)v2434; - float32x4_t v2816 = *(const float32x4_t *)v2446; - float32x4_t v2818 = *(const float32x4_t *)v2456; - float32x4_t v2820 = *(const float32x4_t *)v2468; - float32x4_t v2822 = *(const float32x4_t *)v2478; - float32x4_t v2824 = *(const float32x4_t *)v2490; - float32x4_t v2828 
= *(const float32x4_t *)v2510; - float32x4_t v2830 = *(const float32x4_t *)v2520; - float32x4_t v2832 = *(const float32x4_t *)v2532; - float32x4_t v2834 = *(const float32x4_t *)v2542; - float32x4_t v2836 = *(const float32x4_t *)v2554; - float32x4_t v2838 = *(const float32x4_t *)v2564; - float32x4_t v2840 = *(const float32x4_t *)v2576; - float32x4_t v2842 = *(const float32x4_t *)v2586; + float32x4_t v2802 = vld1q_f32((const float32_t *)v2369); + float32x4_t v2804 = vld1q_f32((const float32_t *)v2380); + float32x4_t v2806 = vld1q_f32((const float32_t *)v2390); + float32x4_t v2808 = vld1q_f32((const float32_t *)v2402); + float32x4_t v2810 = vld1q_f32((const float32_t *)v2412); + float32x4_t v2812 = vld1q_f32((const float32_t *)v2424); + float32x4_t v2814 = vld1q_f32((const float32_t *)v2434); + float32x4_t v2816 = vld1q_f32((const float32_t *)v2446); + float32x4_t v2818 = vld1q_f32((const float32_t *)v2456); + float32x4_t v2820 = vld1q_f32((const float32_t *)v2468); + float32x4_t v2822 = vld1q_f32((const float32_t *)v2478); + float32x4_t v2824 = vld1q_f32((const float32_t *)v2490); + float32x4_t v2828 = vld1q_f32((const float32_t *)v2510); + float32x4_t v2830 = vld1q_f32((const float32_t *)v2520); + float32x4_t v2832 = vld1q_f32((const float32_t *)v2532); + float32x4_t v2834 = vld1q_f32((const float32_t *)v2542); + float32x4_t v2836 = vld1q_f32((const float32_t *)v2554); + float32x4_t v2838 = vld1q_f32((const float32_t *)v2564); + float32x4_t v2840 = vld1q_f32((const float32_t *)v2576); + float32x4_t v2842 = vld1q_f32((const float32_t *)v2586); float32x4_t v42 = vtrn1q_f32(v2802, v2802); float32x4_t v43 = vtrn2q_f32(v2802, v2802); float32x4_t v92 = vtrn1q_f32(v2804, v2804); @@ -17811,8 +17811,8 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu22(const armral_cmplx_f32_t *restrict x, float32x4_t v1106 = vsubq_f32(v1075, v1091); float32x4_t v1107 = vaddq_f32(v1097, v1099); float32x4_t v1125 = vaddq_f32(v1101, v1102); - *(float32x4_t *)v2609 = v717; - *(float32x4_t *)v2618 = 
v936; + vst1q_f32((float32_t *)v2609, v717); + vst1q_f32((float32_t *)v2618, v936); float32x4_t v889 = vaddq_f32(v888, v873); float32x4_t v890 = vsubq_f32(v873, v875); float32x4_t v892 = vaddq_f32(v873, v879); @@ -17873,26 +17873,26 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu22(const armral_cmplx_f32_t *restrict x, float32x4_t v919 = vsubq_f32(v897, v909); float32x4_t v1129 = vaddq_f32(v1116, v1128); float32x4_t v1138 = vsubq_f32(v1116, v1128); - *(float32x4_t *)v2645 = v918; - *(float32x4_t *)v2654 = v1137; - *(float32x4_t *)v2663 = v917; - *(float32x4_t *)v2672 = v1136; - *(float32x4_t *)v2681 = v916; - *(float32x4_t *)v2690 = v1135; - *(float32x4_t *)v2699 = v915; - *(float32x4_t *)v2708 = v1134; - *(float32x4_t *)v2717 = v914; - *(float32x4_t *)v2726 = v1133; - *(float32x4_t *)v2735 = v913; - *(float32x4_t *)v2744 = v1132; - *(float32x4_t *)v2753 = v912; - *(float32x4_t *)v2762 = v1131; - *(float32x4_t *)v2771 = v911; - *(float32x4_t *)v2780 = v1130; - *(float32x4_t *)v2627 = v919; - *(float32x4_t *)v2636 = v1138; - *(float32x4_t *)v2789 = v910; - *(float32x4_t *)v2798 = v1129; + vst1q_f32((float32_t *)v2645, v918); + vst1q_f32((float32_t *)v2654, v1137); + vst1q_f32((float32_t *)v2663, v917); + vst1q_f32((float32_t *)v2672, v1136); + vst1q_f32((float32_t *)v2681, v916); + vst1q_f32((float32_t *)v2690, v1135); + vst1q_f32((float32_t *)v2699, v915); + vst1q_f32((float32_t *)v2708, v1134); + vst1q_f32((float32_t *)v2717, v914); + vst1q_f32((float32_t *)v2726, v1133); + vst1q_f32((float32_t *)v2735, v913); + vst1q_f32((float32_t *)v2744, v1132); + vst1q_f32((float32_t *)v2753, v912); + vst1q_f32((float32_t *)v2762, v1131); + vst1q_f32((float32_t *)v2771, v911); + vst1q_f32((float32_t *)v2780, v1130); + vst1q_f32((float32_t *)v2627, v919); + vst1q_f32((float32_t *)v2636, v1138); + vst1q_f32((float32_t *)v2789, v910); + vst1q_f32((float32_t *)v2798, v1129); v5 += 2 * 1; v6 += 2 * 1; } @@ -19235,7 +19235,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu24(const 
armral_cmplx_f32_t *restrict x, float32x2_t v896 = (float32x2_t){v894, v895}; const float32x2_t *v2252 = &v5[0]; float32x2_t *v2262 = &v6[0]; - float32x4_t v2491 = *(const float32x4_t *)v2103; + float32x4_t v2491 = vld1q_f32((const float32_t *)v2103); float32x4_t v66 = vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[14])); float32x4_t v68 = @@ -19385,7 +19385,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu24(const armral_cmplx_f32_t *restrict x, float32x2_t *v2451 = &v6[ostride * 15]; float32x2_t *v2460 = &v6[ostride * 7]; float32x2_t *v2469 = &v6[ostride * 23]; - float32x4_t v2519 = *(const float32x4_t *)v2252; + float32x4_t v2519 = vld1q_f32((const float32_t *)v2252); float32x4_t v303 = vmulq_f32(v297, v302); float32x4_t v731 = vcombine_f32(v729, v729); float32x4_t v739 = vcombine_f32(v737, v737); @@ -19393,28 +19393,28 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu24(const armral_cmplx_f32_t *restrict x, float32x4_t v814 = vcombine_f32(v812, v812); float32x4_t v882 = vcombine_f32(v880, v880); float32x4_t v900 = vcombine_f32(v898, v898); - float32x4_t v2473 = *(const float32x4_t *)v2007; - float32x4_t v2475 = *(const float32x4_t *)v2017; - float32x4_t v2477 = *(const float32x4_t *)v2029; - float32x4_t v2479 = *(const float32x4_t *)v2039; - float32x4_t v2481 = *(const float32x4_t *)v2051; - float32x4_t v2483 = *(const float32x4_t *)v2061; - float32x4_t v2485 = *(const float32x4_t *)v2071; - float32x4_t v2487 = *(const float32x4_t *)v2083; - float32x4_t v2489 = *(const float32x4_t *)v2093; - float32x4_t v2493 = *(const float32x4_t *)v2113; - float32x4_t v2495 = *(const float32x4_t *)v2123; - float32x4_t v2497 = *(const float32x4_t *)v2133; - float32x4_t v2499 = *(const float32x4_t *)v2145; - float32x4_t v2501 = *(const float32x4_t *)v2155; - float32x4_t v2503 = *(const float32x4_t *)v2165; - float32x4_t v2505 = *(const float32x4_t *)v2177; - float32x4_t v2507 = *(const float32x4_t *)v2187; - float32x4_t v2509 = *(const float32x4_t *)v2197; - float32x4_t v2511 = 
*(const float32x4_t *)v2209; - float32x4_t v2513 = *(const float32x4_t *)v2219; - float32x4_t v2515 = *(const float32x4_t *)v2229; - float32x4_t v2517 = *(const float32x4_t *)v2241; + float32x4_t v2473 = vld1q_f32((const float32_t *)v2007); + float32x4_t v2475 = vld1q_f32((const float32_t *)v2017); + float32x4_t v2477 = vld1q_f32((const float32_t *)v2029); + float32x4_t v2479 = vld1q_f32((const float32_t *)v2039); + float32x4_t v2481 = vld1q_f32((const float32_t *)v2051); + float32x4_t v2483 = vld1q_f32((const float32_t *)v2061); + float32x4_t v2485 = vld1q_f32((const float32_t *)v2071); + float32x4_t v2487 = vld1q_f32((const float32_t *)v2083); + float32x4_t v2489 = vld1q_f32((const float32_t *)v2093); + float32x4_t v2493 = vld1q_f32((const float32_t *)v2113); + float32x4_t v2495 = vld1q_f32((const float32_t *)v2123); + float32x4_t v2497 = vld1q_f32((const float32_t *)v2133); + float32x4_t v2499 = vld1q_f32((const float32_t *)v2145); + float32x4_t v2501 = vld1q_f32((const float32_t *)v2155); + float32x4_t v2503 = vld1q_f32((const float32_t *)v2165); + float32x4_t v2505 = vld1q_f32((const float32_t *)v2177); + float32x4_t v2507 = vld1q_f32((const float32_t *)v2187); + float32x4_t v2509 = vld1q_f32((const float32_t *)v2197); + float32x4_t v2511 = vld1q_f32((const float32_t *)v2209); + float32x4_t v2513 = vld1q_f32((const float32_t *)v2219); + float32x4_t v2515 = vld1q_f32((const float32_t *)v2229); + float32x4_t v2517 = vld1q_f32((const float32_t *)v2241); float32x4_t v61 = vtrn1q_f32(v2473, v2473); float32x4_t v62 = vtrn2q_f32(v2473, v2473); float32x4_t v73 = vtrn1q_f32(v2475, v2475); @@ -19623,8 +19623,8 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu24(const armral_cmplx_f32_t *restrict x, float32x4_t v905 = vsubq_f32(v883, v901); float32x4_t v912 = vaddq_f32(v693, v776); float32x4_t v1008 = vaddq_f32(v694, v781); - *(float32x4_t *)v2262 = v693; - *(float32x4_t *)v2370 = v694; + vst1q_f32((float32_t *)v2262, v693); + vst1q_f32((float32_t *)v2370, v694); float32x4_t 
v746 = vaddq_f32(v690, v719); float32x4_t v747 = vsubq_f32(v690, v719); float32x4_t v750 = vaddq_f32(v732, v740); @@ -19647,12 +19647,12 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu24(const armral_cmplx_f32_t *restrict x, float32x4_t v755 = vsubq_f32(v749, v751); float32x4_t v960 = vaddq_f32(v747, v822); float32x4_t v1056 = vaddq_f32(v746, v821); - *(float32x4_t *)v2271 = v914; - *(float32x4_t *)v2280 = v913; - *(float32x4_t *)v2316 = v747; - *(float32x4_t *)v2379 = v1010; - *(float32x4_t *)v2388 = v1009; - *(float32x4_t *)v2424 = v746; + vst1q_f32((float32_t *)v2271, v914); + vst1q_f32((float32_t *)v2280, v913); + vst1q_f32((float32_t *)v2316, v747); + vst1q_f32((float32_t *)v2379, v1010); + vst1q_f32((float32_t *)v2388, v1009); + vst1q_f32((float32_t *)v2424, v746); float32x4_t v936 = vaddq_f32(v753, v828); float32x4_t v961 = vaddq_f32(v960, v903); float32x4_t v962 = vsubq_f32(v960, v903); @@ -19661,10 +19661,10 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu24(const armral_cmplx_f32_t *restrict x, float32x4_t v1057 = vaddq_f32(v1056, v902); float32x4_t v1058 = vsubq_f32(v1056, v902); float32x4_t v1080 = vaddq_f32(v752, v827); - *(float32x4_t *)v2289 = v753; - *(float32x4_t *)v2343 = v754; - *(float32x4_t *)v2397 = v755; - *(float32x4_t *)v2451 = v752; + vst1q_f32((float32_t *)v2289, v753); + vst1q_f32((float32_t *)v2343, v754); + vst1q_f32((float32_t *)v2397, v755); + vst1q_f32((float32_t *)v2451, v752); float32x4_t v937 = vaddq_f32(v936, v909); float32x4_t v938 = vsubq_f32(v936, v909); float32x4_t v985 = vaddq_f32(v984, v910); @@ -19673,18 +19673,18 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu24(const armral_cmplx_f32_t *restrict x, float32x4_t v1034 = vsubq_f32(v1032, v911); float32x4_t v1081 = vaddq_f32(v1080, v908); float32x4_t v1082 = vsubq_f32(v1080, v908); - *(float32x4_t *)v2325 = v962; - *(float32x4_t *)v2334 = v961; - *(float32x4_t *)v2433 = v1058; - *(float32x4_t *)v2442 = v1057; - *(float32x4_t *)v2298 = v938; - *(float32x4_t *)v2307 = v937; - *(float32x4_t 
*)v2352 = v986; - *(float32x4_t *)v2361 = v985; - *(float32x4_t *)v2406 = v1034; - *(float32x4_t *)v2415 = v1033; - *(float32x4_t *)v2460 = v1082; - *(float32x4_t *)v2469 = v1081; + vst1q_f32((float32_t *)v2325, v962); + vst1q_f32((float32_t *)v2334, v961); + vst1q_f32((float32_t *)v2433, v1058); + vst1q_f32((float32_t *)v2442, v1057); + vst1q_f32((float32_t *)v2298, v938); + vst1q_f32((float32_t *)v2307, v937); + vst1q_f32((float32_t *)v2352, v986); + vst1q_f32((float32_t *)v2361, v985); + vst1q_f32((float32_t *)v2406, v1034); + vst1q_f32((float32_t *)v2415, v1033); + vst1q_f32((float32_t *)v2460, v1082); + vst1q_f32((float32_t *)v2469, v1081); v5 += 2 * 1; v6 += 2 * 1; } @@ -20787,7 +20787,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu25(const armral_cmplx_f32_t *restrict x, float32x2_t v1978 = (float32x2_t){v1977, v1977}; const float32x2_t *v3877 = &v5[0]; float32x2_t *v3887 = &v6[0]; - float32x4_t v4115 = *(const float32x4_t *)v3677; + float32x4_t v4115 = vld1q_f32((const float32_t *)v3677); float32x4_t v35 = vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[8])); float32x4_t v37 = @@ -20955,7 +20955,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu25(const armral_cmplx_f32_t *restrict x, float32x2_t *v4085 = &v6[ostride * 14]; float32x2_t *v4094 = &v6[ostride * 19]; float32x2_t *v4103 = &v6[ostride * 24]; - float32x4_t v4155 = *(const float32x4_t *)v3877; + float32x4_t v4155 = vld1q_f32((const float32_t *)v3877); float32x4_t v112 = vmulq_f32(v106, v111); float32x4_t v1209 = vcombine_f32(v1205, v1205); float32x4_t v1328 = vcombine_f32(v1326, v1326); @@ -20968,29 +20968,29 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu25(const armral_cmplx_f32_t *restrict x, float32x4_t v1860 = vcombine_f32(v1858, v1858); float32x4_t v1881 = vcombine_f32(v1879, v1879); float32x4_t v1953 = vcombine_f32(v1951, v1951); - float32x4_t v4107 = *(const float32x4_t *)v3637; - float32x4_t v4109 = *(const float32x4_t *)v3647; - float32x4_t v4111 = *(const float32x4_t *)v3657; - float32x4_t 
v4113 = *(const float32x4_t *)v3667; - float32x4_t v4117 = *(const float32x4_t *)v3686; - float32x4_t v4119 = *(const float32x4_t *)v3696; - float32x4_t v4121 = *(const float32x4_t *)v3706; - float32x4_t v4123 = *(const float32x4_t *)v3716; - float32x4_t v4125 = *(const float32x4_t *)v3726; - float32x4_t v4127 = *(const float32x4_t *)v3736; - float32x4_t v4129 = *(const float32x4_t *)v3746; - float32x4_t v4131 = *(const float32x4_t *)v3756; - float32x4_t v4133 = *(const float32x4_t *)v3766; - float32x4_t v4135 = *(const float32x4_t *)v3776; - float32x4_t v4137 = *(const float32x4_t *)v3786; - float32x4_t v4139 = *(const float32x4_t *)v3796; - float32x4_t v4141 = *(const float32x4_t *)v3806; - float32x4_t v4143 = *(const float32x4_t *)v3816; - float32x4_t v4145 = *(const float32x4_t *)v3826; - float32x4_t v4147 = *(const float32x4_t *)v3836; - float32x4_t v4149 = *(const float32x4_t *)v3846; - float32x4_t v4151 = *(const float32x4_t *)v3856; - float32x4_t v4153 = *(const float32x4_t *)v3866; + float32x4_t v4107 = vld1q_f32((const float32_t *)v3637); + float32x4_t v4109 = vld1q_f32((const float32_t *)v3647); + float32x4_t v4111 = vld1q_f32((const float32_t *)v3657); + float32x4_t v4113 = vld1q_f32((const float32_t *)v3667); + float32x4_t v4117 = vld1q_f32((const float32_t *)v3686); + float32x4_t v4119 = vld1q_f32((const float32_t *)v3696); + float32x4_t v4121 = vld1q_f32((const float32_t *)v3706); + float32x4_t v4123 = vld1q_f32((const float32_t *)v3716); + float32x4_t v4125 = vld1q_f32((const float32_t *)v3726); + float32x4_t v4127 = vld1q_f32((const float32_t *)v3736); + float32x4_t v4129 = vld1q_f32((const float32_t *)v3746); + float32x4_t v4131 = vld1q_f32((const float32_t *)v3756); + float32x4_t v4133 = vld1q_f32((const float32_t *)v3766); + float32x4_t v4135 = vld1q_f32((const float32_t *)v3776); + float32x4_t v4137 = vld1q_f32((const float32_t *)v3786); + float32x4_t v4139 = vld1q_f32((const float32_t *)v3796); + float32x4_t v4141 = vld1q_f32((const float32_t 
*)v3806); + float32x4_t v4143 = vld1q_f32((const float32_t *)v3816); + float32x4_t v4145 = vld1q_f32((const float32_t *)v3826); + float32x4_t v4147 = vld1q_f32((const float32_t *)v3836); + float32x4_t v4149 = vld1q_f32((const float32_t *)v3846); + float32x4_t v4151 = vld1q_f32((const float32_t *)v3856); + float32x4_t v4153 = vld1q_f32((const float32_t *)v3866); float32x4_t v30 = vtrn1q_f32(v4107, v4107); float32x4_t v31 = vtrn2q_f32(v4107, v4107); float32x4_t v49 = vtrn1q_f32(v4109, v4109); @@ -21385,7 +21385,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu25(const armral_cmplx_f32_t *restrict x, float32x4_t v1848 = vfmaq_f32(v1847, v882, v1838); float32x4_t v1862 = vfmaq_f32(v1861, v1148, v1852); float32x4_t v1883 = vfmaq_f32(v1882, v1015, v1873); - *(float32x4_t *)v3887 = v1251; + vst1q_f32((float32_t *)v3887, v1251); float32x4_t v1238 = vsubq_f32(v1226, v1237); float32x4_t v1243 = vmulq_f32(v1226, v1979); float32x4_t v1365 = vsubq_f32(v1364, v1359); @@ -21444,8 +21444,8 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu25(const armral_cmplx_f32_t *restrict x, float32x4_t v1892 = vsubq_f32(v1869, v1890); float32x4_t v1904 = vaddq_f32(v1863, v1903); float32x4_t v1922 = vsubq_f32(v1921, v1884); - *(float32x4_t *)v3905 = v1283; - *(float32x4_t *)v3977 = v1587; + vst1q_f32((float32_t *)v3905, v1283); + vst1q_f32((float32_t *)v3977, v1587); float32x4_t v1309 = vsubq_f32(v1308, v1267); float32x4_t v1394 = vsubq_f32(v595, v1393); float32x4_t v1434 = vmulq_f32(v1432, v1953); @@ -21460,10 +21460,10 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu25(const armral_cmplx_f32_t *restrict x, float32x4_t v1923 = vaddq_f32(v616, v1891); float32x4_t v1936 = vrev64q_f32(v1904); float32x4_t v1952 = vrev64q_f32(v1922); - *(float32x4_t *)v3896 = v1267; - *(float32x4_t *)v3914 = v1296; - *(float32x4_t *)v3932 = v1419; - *(float32x4_t *)v4022 = v1755; + vst1q_f32((float32_t *)v3896, v1267); + vst1q_f32((float32_t *)v3914, v1296); + vst1q_f32((float32_t *)v3932, v1419); + vst1q_f32((float32_t *)v4022, 
v1755); float32x4_t v1406 = vsubq_f32(v1394, v1405); float32x4_t v1411 = vmulq_f32(v1394, v1979); float32x4_t v1580 = vsubq_f32(v1579, v1574); @@ -21474,8 +21474,8 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu25(const armral_cmplx_f32_t *restrict x, float32x4_t v1898 = vsubq_f32(v616, v1897); float32x4_t v1938 = vmulq_f32(v1936, v1953); float32x4_t v1954 = vmulq_f32(v1952, v1953); - *(float32x4_t *)v3923 = v1309; - *(float32x4_t *)v4067 = v1923; + vst1q_f32((float32_t *)v3923, v1309); + vst1q_f32((float32_t *)v4067, v1923); float32x4_t v1412 = vsubq_f32(v1411, v1406); float32x4_t v1451 = vsubq_f32(v1406, v1450); float32x4_t v1463 = vmulq_f32(v1406, v1979); @@ -21487,7 +21487,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu25(const armral_cmplx_f32_t *restrict x, float32x4_t v1799 = vmulq_f32(v1742, v1979); float32x4_t v1910 = vsubq_f32(v1898, v1909); float32x4_t v1915 = vmulq_f32(v1898, v1979); - *(float32x4_t *)v3995 = v1619; + vst1q_f32((float32_t *)v3995, v1619); float32x4_t v1435 = vsubq_f32(v1412, v1434); float32x4_t v1464 = vsubq_f32(v1463, v1451); float32x4_t v1476 = vmulq_f32(v1412, v1979); @@ -21498,27 +21498,27 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu25(const armral_cmplx_f32_t *restrict x, float32x4_t v1916 = vsubq_f32(v1915, v1910); float32x4_t v1955 = vsubq_f32(v1910, v1954); float32x4_t v1967 = vmulq_f32(v1910, v1979); - *(float32x4_t *)v3950 = v1451; - *(float32x4_t *)v3986 = v1603; - *(float32x4_t *)v4004 = v1632; - *(float32x4_t *)v4040 = v1787; + vst1q_f32((float32_t *)v3950, v1451); + vst1q_f32((float32_t *)v3986, v1603); + vst1q_f32((float32_t *)v4004, v1632); + vst1q_f32((float32_t *)v4040, v1787); float32x4_t v1477 = vsubq_f32(v1476, v1435); float32x4_t v1813 = vsubq_f32(v1812, v1771); float32x4_t v1939 = vsubq_f32(v1916, v1938); float32x4_t v1968 = vsubq_f32(v1967, v1955); float32x4_t v1980 = vmulq_f32(v1916, v1979); - *(float32x4_t *)v3941 = v1435; - *(float32x4_t *)v3959 = v1464; - *(float32x4_t *)v4013 = v1645; - *(float32x4_t *)v4031 = v1771; - 
*(float32x4_t *)v4049 = v1800; - *(float32x4_t *)v4085 = v1955; + vst1q_f32((float32_t *)v3941, v1435); + vst1q_f32((float32_t *)v3959, v1464); + vst1q_f32((float32_t *)v4013, v1645); + vst1q_f32((float32_t *)v4031, v1771); + vst1q_f32((float32_t *)v4049, v1800); + vst1q_f32((float32_t *)v4085, v1955); float32x4_t v1981 = vsubq_f32(v1980, v1939); - *(float32x4_t *)v3968 = v1477; - *(float32x4_t *)v4058 = v1813; - *(float32x4_t *)v4076 = v1939; - *(float32x4_t *)v4094 = v1968; - *(float32x4_t *)v4103 = v1981; + vst1q_f32((float32_t *)v3968, v1477); + vst1q_f32((float32_t *)v4058, v1813); + vst1q_f32((float32_t *)v4076, v1939); + vst1q_f32((float32_t *)v4094, v1968); + vst1q_f32((float32_t *)v4103, v1981); v5 += 2 * 1; v6 += 2 * 1; } @@ -23061,7 +23061,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu32(const armral_cmplx_f32_t *restrict x, float32x2_t v1763 = (float32x2_t){v1761, v1762}; const float32x2_t *v3622 = &v5[0]; float32x2_t *v3632 = &v6[0]; - float32x4_t v3945 = *(const float32x4_t *)v3451; + float32x4_t v3945 = vld1q_f32((const float32_t *)v3451); float32x4_t v47 = vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[30])); float32x4_t v49 = @@ -23270,7 +23270,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu32(const armral_cmplx_f32_t *restrict x, float32x2_t *v3893 = &v6[ostride * 15]; float32x2_t *v3902 = &v6[ostride * 23]; float32x2_t *v3911 = &v6[ostride * 31]; - float32x4_t v3977 = *(const float32x4_t *)v3622; + float32x4_t v3977 = vld1q_f32((const float32_t *)v3622); float32x4_t v460 = vmulq_f32(v454, v459); float32x4_t v1392 = vcombine_f32(v1390, v1390); float32x4_t v1462 = vcombine_f32(v1460, v1460); @@ -23282,36 +23282,36 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu32(const armral_cmplx_f32_t *restrict x, float32x4_t v1742 = vcombine_f32(v1740, v1740); float32x4_t v1755 = vcombine_f32(v1753, v1753); float32x4_t v1767 = vcombine_f32(v1765, v1765); - float32x4_t v3915 = *(const float32x4_t *)v3292; - float32x4_t v3917 = *(const float32x4_t *)v3303; - 
float32x4_t v3919 = *(const float32x4_t *)v3313; - float32x4_t v3921 = *(const float32x4_t *)v3323; - float32x4_t v3923 = *(const float32x4_t *)v3333; - float32x4_t v3925 = *(const float32x4_t *)v3345; - float32x4_t v3927 = *(const float32x4_t *)v3355; - float32x4_t v3929 = *(const float32x4_t *)v3367; - float32x4_t v3931 = *(const float32x4_t *)v3377; - float32x4_t v3933 = *(const float32x4_t *)v3389; - float32x4_t v3935 = *(const float32x4_t *)v3399; - float32x4_t v3937 = *(const float32x4_t *)v3409; - float32x4_t v3939 = *(const float32x4_t *)v3419; - float32x4_t v3941 = *(const float32x4_t *)v3431; - float32x4_t v3943 = *(const float32x4_t *)v3441; - float32x4_t v3947 = *(const float32x4_t *)v3460; - float32x4_t v3949 = *(const float32x4_t *)v3471; - float32x4_t v3951 = *(const float32x4_t *)v3481; - float32x4_t v3953 = *(const float32x4_t *)v3491; - float32x4_t v3955 = *(const float32x4_t *)v3501; - float32x4_t v3957 = *(const float32x4_t *)v3513; - float32x4_t v3959 = *(const float32x4_t *)v3523; - float32x4_t v3961 = *(const float32x4_t *)v3535; - float32x4_t v3963 = *(const float32x4_t *)v3545; - float32x4_t v3965 = *(const float32x4_t *)v3557; - float32x4_t v3967 = *(const float32x4_t *)v3567; - float32x4_t v3969 = *(const float32x4_t *)v3577; - float32x4_t v3971 = *(const float32x4_t *)v3587; - float32x4_t v3973 = *(const float32x4_t *)v3599; - float32x4_t v3975 = *(const float32x4_t *)v3609; + float32x4_t v3915 = vld1q_f32((const float32_t *)v3292); + float32x4_t v3917 = vld1q_f32((const float32_t *)v3303); + float32x4_t v3919 = vld1q_f32((const float32_t *)v3313); + float32x4_t v3921 = vld1q_f32((const float32_t *)v3323); + float32x4_t v3923 = vld1q_f32((const float32_t *)v3333); + float32x4_t v3925 = vld1q_f32((const float32_t *)v3345); + float32x4_t v3927 = vld1q_f32((const float32_t *)v3355); + float32x4_t v3929 = vld1q_f32((const float32_t *)v3367); + float32x4_t v3931 = vld1q_f32((const float32_t *)v3377); + float32x4_t v3933 = vld1q_f32((const 
float32_t *)v3389); + float32x4_t v3935 = vld1q_f32((const float32_t *)v3399); + float32x4_t v3937 = vld1q_f32((const float32_t *)v3409); + float32x4_t v3939 = vld1q_f32((const float32_t *)v3419); + float32x4_t v3941 = vld1q_f32((const float32_t *)v3431); + float32x4_t v3943 = vld1q_f32((const float32_t *)v3441); + float32x4_t v3947 = vld1q_f32((const float32_t *)v3460); + float32x4_t v3949 = vld1q_f32((const float32_t *)v3471); + float32x4_t v3951 = vld1q_f32((const float32_t *)v3481); + float32x4_t v3953 = vld1q_f32((const float32_t *)v3491); + float32x4_t v3955 = vld1q_f32((const float32_t *)v3501); + float32x4_t v3957 = vld1q_f32((const float32_t *)v3513); + float32x4_t v3959 = vld1q_f32((const float32_t *)v3523); + float32x4_t v3961 = vld1q_f32((const float32_t *)v3535); + float32x4_t v3963 = vld1q_f32((const float32_t *)v3545); + float32x4_t v3965 = vld1q_f32((const float32_t *)v3557); + float32x4_t v3967 = vld1q_f32((const float32_t *)v3567); + float32x4_t v3969 = vld1q_f32((const float32_t *)v3577); + float32x4_t v3971 = vld1q_f32((const float32_t *)v3587); + float32x4_t v3973 = vld1q_f32((const float32_t *)v3599); + float32x4_t v3975 = vld1q_f32((const float32_t *)v3609); float32x4_t v42 = vtrn1q_f32(v3915, v3915); float32x4_t v43 = vtrn2q_f32(v3915, v3915); float32x4_t v61 = vtrn1q_f32(v3917, v3917); @@ -23593,8 +23593,8 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu32(const armral_cmplx_f32_t *restrict x, float32x4_t v1671 = vrev64q_f32(v1150); float32x4_t v1678 = vmulq_f32(v1226, v1677); float32x4_t v1684 = vrev64q_f32(v1226); - *(float32x4_t *)v3632 = v1279; - *(float32x4_t *)v3650 = v1280; + vst1q_f32((float32_t *)v3632, v1279); + vst1q_f32((float32_t *)v3650, v1280); float32x4_t v938 = vrev64q_f32(v932); float32x4_t v941 = vaddq_f32(v883, v931); float32x4_t v942 = vsubq_f32(v883, v931); @@ -23638,8 +23638,8 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu32(const armral_cmplx_f32_t *restrict x, float32x4_t v1614 = vrev64q_f32(v1266); float32x4_t v1687 = 
vfmaq_f32(v1665, v1671, v1672); float32x4_t v1688 = vfmaq_f32(v1678, v1684, v1685); - *(float32x4_t *)v3641 = v1281; - *(float32x4_t *)v3659 = v1282; + vst1q_f32((float32_t *)v3641, v1281); + vst1q_f32((float32_t *)v3659, v1282); float32x4_t v943 = vsubq_f32(v884, v940); float32x4_t v944 = vaddq_f32(v884, v940); float32x4_t v1026 = vrev64q_f32(v1020); @@ -23681,8 +23681,8 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu32(const armral_cmplx_f32_t *restrict x, float32x4_t v1741 = vrev64q_f32(v1192); float32x4_t v1748 = vmulq_f32(v1268, v1747); float32x4_t v1754 = vrev64q_f32(v1268); - *(float32x4_t *)v3776 = v1559; - *(float32x4_t *)v3794 = v1560; + vst1q_f32((float32_t *)v3776, v1559); + vst1q_f32((float32_t *)v3794, v1560); float32x4_t v1031 = vsubq_f32(v942, v1028); float32x4_t v1032 = vaddq_f32(v942, v1028); float32x4_t v1115 = vsubq_f32(v944, v1112); @@ -23697,8 +23697,8 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu32(const armral_cmplx_f32_t *restrict x, float32x4_t v1698 = vmulq_f32(v1696, v1767); float32x4_t v1699 = vaddq_f32(v1073, v1689); float32x4_t v1700 = vsubq_f32(v1073, v1689); - *(float32x4_t *)v3704 = v1419; - *(float32x4_t *)v3722 = v1420; + vst1q_f32((float32_t *)v3704, v1419); + vst1q_f32((float32_t *)v3722, v1420); float32x4_t v1346 = vrev64q_f32(v1340); float32x4_t v1349 = vaddq_f32(v1029, v1339); float32x4_t v1350 = vsubq_f32(v1029, v1339); @@ -23713,24 +23713,24 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu32(const armral_cmplx_f32_t *restrict x, float32x4_t v1702 = vaddq_f32(v1074, v1698); float32x4_t v1757 = vfmaq_f32(v1735, v1741, v1742); float32x4_t v1758 = vfmaq_f32(v1748, v1754, v1755); - *(float32x4_t *)v3785 = v1561; - *(float32x4_t *)v3803 = v1562; - *(float32x4_t *)v3848 = v1699; - *(float32x4_t *)v3866 = v1700; + vst1q_f32((float32_t *)v3785, v1561); + vst1q_f32((float32_t *)v3803, v1562); + vst1q_f32((float32_t *)v3848, v1699); + vst1q_f32((float32_t *)v3866, v1700); float32x4_t v1348 = vmulq_f32(v1346, v1767); float32x4_t v1479 = 
vaddq_f32(v1477, v1478); float32x4_t v1480 = vsubq_f32(v1478, v1477); float32x4_t v1628 = vmulq_f32(v1626, v1767); float32x4_t v1759 = vaddq_f32(v1757, v1758); float32x4_t v1760 = vsubq_f32(v1758, v1757); - *(float32x4_t *)v3668 = v1349; - *(float32x4_t *)v3686 = v1350; - *(float32x4_t *)v3713 = v1421; - *(float32x4_t *)v3731 = v1422; - *(float32x4_t *)v3812 = v1629; - *(float32x4_t *)v3830 = v1630; - *(float32x4_t *)v3857 = v1701; - *(float32x4_t *)v3875 = v1702; + vst1q_f32((float32_t *)v3668, v1349); + vst1q_f32((float32_t *)v3686, v1350); + vst1q_f32((float32_t *)v3713, v1421); + vst1q_f32((float32_t *)v3731, v1422); + vst1q_f32((float32_t *)v3812, v1629); + vst1q_f32((float32_t *)v3830, v1630); + vst1q_f32((float32_t *)v3857, v1701); + vst1q_f32((float32_t *)v3875, v1702); float32x4_t v1351 = vsubq_f32(v1030, v1348); float32x4_t v1352 = vaddq_f32(v1030, v1348); float32x4_t v1486 = vrev64q_f32(v1480); @@ -23743,22 +23743,22 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu32(const armral_cmplx_f32_t *restrict x, float32x4_t v1770 = vsubq_f32(v1115, v1759); float32x4_t v1488 = vmulq_f32(v1486, v1767); float32x4_t v1768 = vmulq_f32(v1766, v1767); - *(float32x4_t *)v3677 = v1351; - *(float32x4_t *)v3695 = v1352; - *(float32x4_t *)v3740 = v1489; - *(float32x4_t *)v3758 = v1490; - *(float32x4_t *)v3821 = v1631; - *(float32x4_t *)v3839 = v1632; - *(float32x4_t *)v3884 = v1769; - *(float32x4_t *)v3902 = v1770; + vst1q_f32((float32_t *)v3677, v1351); + vst1q_f32((float32_t *)v3695, v1352); + vst1q_f32((float32_t *)v3740, v1489); + vst1q_f32((float32_t *)v3758, v1490); + vst1q_f32((float32_t *)v3821, v1631); + vst1q_f32((float32_t *)v3839, v1632); + vst1q_f32((float32_t *)v3884, v1769); + vst1q_f32((float32_t *)v3902, v1770); float32x4_t v1491 = vsubq_f32(v1114, v1488); float32x4_t v1492 = vaddq_f32(v1114, v1488); float32x4_t v1771 = vsubq_f32(v1116, v1768); float32x4_t v1772 = vaddq_f32(v1116, v1768); - *(float32x4_t *)v3749 = v1491; - *(float32x4_t *)v3767 = v1492; - 
*(float32x4_t *)v3893 = v1771; - *(float32x4_t *)v3911 = v1772; + vst1q_f32((float32_t *)v3749, v1491); + vst1q_f32((float32_t *)v3767, v1492); + vst1q_f32((float32_t *)v3893, v1771); + vst1q_f32((float32_t *)v3911, v1772); v5 += 2 * 1; v6 += 2 * 1; } diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ac_n_uu.c b/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ac_n_uu.c index 22ab0d6e819066b2c1837c70521b79cf89cf5944..81f601770bd55f12f3302a4321d122fb6642ee40 100644 --- a/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ac_n_uu.c +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ac_n_uu.c @@ -23,14 +23,14 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu2(const armral_cmplx_f32_t *restrict x, int32_t *v142 = &v6[ostride]; const float32x2_t *v114 = &v5[0]; int32_t *v133 = &v6[0]; - float32x4_t v148 = *(const float32x4_t *)v123; - float32x4_t v146 = *(const float32x4_t *)v114; + float32x4_t v148 = vld1q_f32((const float32_t *)v123); + float32x4_t v146 = vld1q_f32((const float32_t *)v114); float32x4_t v35 = vaddq_f32(v146, v148); float32x4_t v36 = vsubq_f32(v146, v148); int16x4_t v49 = vqmovn_s32(vcvtq_n_s32_f32(v35, 15)); int16x4_t v57 = vqmovn_s32(vcvtq_n_s32_f32(v36, 15)); - *(int16x4_t *)v133 = v49; - *(int16x4_t *)v142 = v57; + vst1_s16((int16_t *)v133, v49); + vst1_s16((int16_t *)v142, v57); v5 += 2 * 1; v6 += 2 * 1; } @@ -117,14 +117,14 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu3(const armral_cmplx_f32_t *restrict x, float32x2_t v57 = (float32x2_t){v55, v56}; const float32x2_t *v181 = &v5[0]; int32_t *v191 = &v6[0]; - float32x4_t v213 = *(const float32x4_t *)v162; + float32x4_t v213 = vld1q_f32((const float32_t *)v162); float32x4_t v53 = vcombine_f32(v52, v52); float32x2_t v59 = vmul_f32(v58, v57); const float32x2_t *v171 = &v5[istride * 2]; int32_t *v209 = &v6[ostride * 2]; - float32x4_t v217 = *(const float32x4_t *)v181; + float32x4_t v217 = vld1q_f32((const float32_t *)v181); float32x4_t v61 = vcombine_f32(v59, v59); - float32x4_t v215 = *(const float32x4_t *)v171; + float32x4_t v215 = vld1q_f32((const 
float32_t *)v171); float32x4_t v35 = vaddq_f32(v213, v215); float32x4_t v36 = vsubq_f32(v213, v215); float32x4_t v44 = vaddq_f32(v35, v217); @@ -135,11 +135,11 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu3(const armral_cmplx_f32_t *restrict x, int16x4_t v68 = vqmovn_s32(vcvtq_n_s32_f32(v44, 15)); float32x4_t v64 = vaddq_f32(v63, v62); float32x4_t v65 = vsubq_f32(v63, v62); - *(int16x4_t *)v191 = v68; + vst1_s16((int16_t *)v191, v68); int16x4_t v76 = vqmovn_s32(vcvtq_n_s32_f32(v65, 15)); int16x4_t v84 = vqmovn_s32(vcvtq_n_s32_f32(v64, 15)); - *(int16x4_t *)v200 = v76; - *(int16x4_t *)v209 = v84; + vst1_s16((int16_t *)v200, v76); + vst1_s16((int16_t *)v209, v84); v5 += 2 * 1; v6 += 2 * 1; } @@ -270,16 +270,16 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu4(const armral_cmplx_f32_t *restrict x, float32x2_t v72 = (float32x2_t){v70, v71}; const float32x2_t *v202 = &v5[0]; int32_t *v239 = &v6[0]; - float32x4_t v274 = *(const float32x4_t *)v220; + float32x4_t v274 = vld1q_f32((const float32_t *)v220); float32x2_t v74 = vmul_f32(v73, v72); const float32x2_t *v211 = &v5[istride * 2]; const float32x2_t *v229 = &v5[istride * 3]; int32_t *v257 = &v6[ostride * 2]; int32_t *v266 = &v6[ostride * 3]; - float32x4_t v270 = *(const float32x4_t *)v202; + float32x4_t v270 = vld1q_f32((const float32_t *)v202); float32x4_t v76 = vcombine_f32(v74, v74); - float32x4_t v272 = *(const float32x4_t *)v211; - float32x4_t v276 = *(const float32x4_t *)v229; + float32x4_t v272 = vld1q_f32((const float32_t *)v211); + float32x4_t v276 = vld1q_f32((const float32_t *)v229); float32x4_t v35 = vaddq_f32(v270, v272); float32x4_t v36 = vsubq_f32(v270, v272); float32x4_t v51 = vaddq_f32(v274, v276); @@ -292,12 +292,12 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu4(const armral_cmplx_f32_t *restrict x, int16x4_t v98 = vqmovn_s32(vcvtq_n_s32_f32(v54, 15)); float32x4_t v78 = vaddq_f32(v36, v77); float32x4_t v79 = vsubq_f32(v36, v77); - *(int16x4_t *)v239 = v82; - *(int16x4_t *)v257 = v98; + vst1_s16((int16_t *)v239, 
v82); + vst1_s16((int16_t *)v257, v98); int16x4_t v90 = vqmovn_s32(vcvtq_n_s32_f32(v79, 15)); int16x4_t v106 = vqmovn_s32(vcvtq_n_s32_f32(v78, 15)); - *(int16x4_t *)v248 = v90; - *(int16x4_t *)v266 = v106; + vst1_s16((int16_t *)v248, v90); + vst1_s16((int16_t *)v266, v106); v5 += 2 * 1; v6 += 2 * 1; } @@ -456,7 +456,7 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu5(const armral_cmplx_f32_t *restrict x, float32x2_t v97 = (float32x2_t){v95, v96}; const float32x2_t *v312 = &v5[0]; int32_t *v322 = &v6[0]; - float32x4_t v362 = *(const float32x4_t *)v275; + float32x4_t v362 = vld1q_f32((const float32_t *)v275); float32x4_t v72 = vcombine_f32(v71, v71); float32x4_t v77 = vcombine_f32(v76, v76); float32x2_t v83 = vmul_f32(v98, v81); @@ -468,13 +468,13 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu5(const armral_cmplx_f32_t *restrict x, int32_t *v340 = &v6[ostride * 2]; int32_t *v349 = &v6[ostride * 3]; int32_t *v358 = &v6[ostride * 4]; - float32x4_t v370 = *(const float32x4_t *)v312; + float32x4_t v370 = vld1q_f32((const float32_t *)v312); float32x4_t v85 = vcombine_f32(v83, v83); float32x4_t v93 = vcombine_f32(v91, v91); float32x4_t v101 = vcombine_f32(v99, v99); - float32x4_t v364 = *(const float32x4_t *)v284; - float32x4_t v366 = *(const float32x4_t *)v293; - float32x4_t v368 = *(const float32x4_t *)v302; + float32x4_t v364 = vld1q_f32((const float32_t *)v284); + float32x4_t v366 = vld1q_f32((const float32_t *)v293); + float32x4_t v368 = vld1q_f32((const float32_t *)v302); float32x4_t v35 = vaddq_f32(v362, v364); float32x4_t v36 = vsubq_f32(v362, v364); float32x4_t v51 = vaddq_f32(v366, v368); @@ -497,7 +497,7 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu5(const armral_cmplx_f32_t *restrict x, float32x4_t v105 = vsubq_f32(v103, v78); float32x4_t v106 = vsubq_f32(v86, v94); float32x4_t v107 = vaddq_f32(v94, v102); - *(int16x4_t *)v322 = v114; + vst1_s16((int16_t *)v322, v114); float32x4_t v108 = vaddq_f32(v104, v106); float32x4_t v109 = vsubq_f32(v104, v106); float32x4_t v110 = 
vaddq_f32(v105, v107); @@ -506,10 +506,10 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu5(const armral_cmplx_f32_t *restrict x, int16x4_t v130 = vqmovn_s32(vcvtq_n_s32_f32(v111, 15)); int16x4_t v138 = vqmovn_s32(vcvtq_n_s32_f32(v110, 15)); int16x4_t v146 = vqmovn_s32(vcvtq_n_s32_f32(v108, 15)); - *(int16x4_t *)v331 = v122; - *(int16x4_t *)v340 = v130; - *(int16x4_t *)v349 = v138; - *(int16x4_t *)v358 = v146; + vst1_s16((int16_t *)v331, v122); + vst1_s16((int16_t *)v340, v130); + vst1_s16((int16_t *)v349, v138); + vst1_s16((int16_t *)v358, v146); v5 += 2 * 1; v6 += 2 * 1; } @@ -730,7 +730,7 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu6(const armral_cmplx_f32_t *restrict x, float32x2_t v108 = (float32x2_t){v106, v107}; const float32x2_t *v298 = &v5[0]; int32_t *v353 = &v6[0]; - float32x4_t v412 = *(const float32x4_t *)v343; + float32x4_t v412 = vld1q_f32((const float32_t *)v343); float32x4_t v104 = vcombine_f32(v103, v103); float32x2_t v110 = vmul_f32(v109, v108); const float32x2_t *v307 = &v5[istride * 3]; @@ -741,12 +741,12 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu6(const armral_cmplx_f32_t *restrict x, int32_t *v371 = &v6[ostride * 4]; int32_t *v389 = &v6[ostride * 2]; int32_t *v398 = &v6[ostride * 5]; - float32x4_t v402 = *(const float32x4_t *)v298; + float32x4_t v402 = vld1q_f32((const float32_t *)v298); float32x4_t v112 = vcombine_f32(v110, v110); - float32x4_t v404 = *(const float32x4_t *)v307; - float32x4_t v406 = *(const float32x4_t *)v316; - float32x4_t v408 = *(const float32x4_t *)v325; - float32x4_t v410 = *(const float32x4_t *)v334; + float32x4_t v404 = vld1q_f32((const float32_t *)v307); + float32x4_t v406 = vld1q_f32((const float32_t *)v316); + float32x4_t v408 = vld1q_f32((const float32_t *)v325); + float32x4_t v410 = vld1q_f32((const float32_t *)v334); float32x4_t v35 = vaddq_f32(v402, v404); float32x4_t v36 = vsubq_f32(v402, v404); float32x4_t v51 = vaddq_f32(v406, v408); @@ -773,16 +773,16 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu6(const armral_cmplx_f32_t 
*restrict x, float32x4_t v92 = vsubq_f32(v90, v89); float32x4_t v115 = vaddq_f32(v114, v113); float32x4_t v116 = vsubq_f32(v114, v113); - *(int16x4_t *)v353 = v119; - *(int16x4_t *)v362 = v127; + vst1_s16((int16_t *)v353, v119); + vst1_s16((int16_t *)v362, v127); int16x4_t v135 = vqmovn_s32(vcvtq_n_s32_f32(v92, 15)); int16x4_t v143 = vqmovn_s32(vcvtq_n_s32_f32(v116, 15)); int16x4_t v151 = vqmovn_s32(vcvtq_n_s32_f32(v91, 15)); int16x4_t v159 = vqmovn_s32(vcvtq_n_s32_f32(v115, 15)); - *(int16x4_t *)v371 = v135; - *(int16x4_t *)v380 = v143; - *(int16x4_t *)v389 = v151; - *(int16x4_t *)v398 = v159; + vst1_s16((int16_t *)v371, v135); + vst1_s16((int16_t *)v380, v143); + vst1_s16((int16_t *)v389, v151); + vst1_s16((int16_t *)v398, v159); v5 += 2 * 1; v6 += 2 * 1; } @@ -1019,7 +1019,7 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu7(const armral_cmplx_f32_t *restrict x, float32x2_t v138 = (float32x2_t){v136, v137}; const float32x2_t *v453 = &v5[0]; int32_t *v463 = &v6[0]; - float32x4_t v521 = *(const float32x4_t *)v398; + float32x4_t v521 = vld1q_f32((const float32_t *)v398); float32x4_t v95 = vcombine_f32(v94, v94); float32x4_t v100 = vcombine_f32(v99, v99); float32x4_t v105 = vcombine_f32(v104, v104); @@ -1038,16 +1038,16 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu7(const armral_cmplx_f32_t *restrict x, int32_t *v499 = &v6[ostride * 4]; int32_t *v508 = &v6[ostride * 5]; int32_t *v517 = &v6[ostride * 6]; - float32x4_t v533 = *(const float32x4_t *)v453; + float32x4_t v533 = vld1q_f32((const float32_t *)v453); float32x4_t v118 = vcombine_f32(v116, v116); float32x4_t v126 = vcombine_f32(v124, v124); float32x4_t v134 = vcombine_f32(v132, v132); float32x4_t v142 = vcombine_f32(v140, v140); - float32x4_t v523 = *(const float32x4_t *)v407; - float32x4_t v525 = *(const float32x4_t *)v416; - float32x4_t v527 = *(const float32x4_t *)v425; - float32x4_t v529 = *(const float32x4_t *)v434; - float32x4_t v531 = *(const float32x4_t *)v443; + float32x4_t v523 = vld1q_f32((const float32_t 
*)v407); + float32x4_t v525 = vld1q_f32((const float32_t *)v416); + float32x4_t v527 = vld1q_f32((const float32_t *)v425); + float32x4_t v529 = vld1q_f32((const float32_t *)v434); + float32x4_t v531 = vld1q_f32((const float32_t *)v443); float32x4_t v35 = vaddq_f32(v521, v523); float32x4_t v36 = vsubq_f32(v521, v523); float32x4_t v51 = vaddq_f32(v525, v527); @@ -1085,7 +1085,7 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu7(const armral_cmplx_f32_t *restrict x, float32x4_t v151 = vaddq_f32(v119, v127); float32x4_t v153 = vsubq_f32(v119, v127); float32x4_t v155 = vsubq_f32(v119, v135); - *(int16x4_t *)v463 = v165; + vst1_s16((int16_t *)v463, v165); float32x4_t v146 = vaddq_f32(v145, v106); float32x4_t v148 = vsubq_f32(v147, v111); float32x4_t v150 = vaddq_f32(v149, v111); @@ -1104,12 +1104,12 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu7(const armral_cmplx_f32_t *restrict x, int16x4_t v197 = vqmovn_s32(vcvtq_n_s32_f32(v162, 15)); int16x4_t v205 = vqmovn_s32(vcvtq_n_s32_f32(v159, 15)); int16x4_t v213 = vqmovn_s32(vcvtq_n_s32_f32(v157, 15)); - *(int16x4_t *)v472 = v173; - *(int16x4_t *)v481 = v181; - *(int16x4_t *)v490 = v189; - *(int16x4_t *)v499 = v197; - *(int16x4_t *)v508 = v205; - *(int16x4_t *)v517 = v213; + vst1_s16((int16_t *)v472, v173); + vst1_s16((int16_t *)v481, v181); + vst1_s16((int16_t *)v490, v189); + vst1_s16((int16_t *)v499, v197); + vst1_s16((int16_t *)v508, v205); + vst1_s16((int16_t *)v517, v213); v5 += 2 * 1; v6 += 2 * 1; } @@ -1443,7 +1443,7 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu8(const armral_cmplx_f32_t *restrict x, float32x2_t v139 = (float32x2_t){v138, v138}; const float32x2_t *v390 = &v5[0]; int32_t *v463 = &v6[0]; - float32x4_t v538 = *(const float32x4_t *)v426; + float32x4_t v538 = vld1q_f32((const float32_t *)v426); float32x2_t v125 = vmul_f32(v132, v123); float32x2_t v133 = vmul_f32(v132, v131); float32x4_t v140 = vcombine_f32(v139, v139); @@ -1459,15 +1459,15 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu8(const armral_cmplx_f32_t *restrict x, 
int32_t *v508 = &v6[ostride * 5]; int32_t *v517 = &v6[ostride * 6]; int32_t *v526 = &v6[ostride * 7]; - float32x4_t v530 = *(const float32x4_t *)v390; + float32x4_t v530 = vld1q_f32((const float32_t *)v390); float32x4_t v127 = vcombine_f32(v125, v125); float32x4_t v135 = vcombine_f32(v133, v133); - float32x4_t v532 = *(const float32x4_t *)v399; - float32x4_t v534 = *(const float32x4_t *)v408; - float32x4_t v536 = *(const float32x4_t *)v417; - float32x4_t v540 = *(const float32x4_t *)v435; - float32x4_t v542 = *(const float32x4_t *)v444; - float32x4_t v544 = *(const float32x4_t *)v453; + float32x4_t v532 = vld1q_f32((const float32_t *)v399); + float32x4_t v534 = vld1q_f32((const float32_t *)v408); + float32x4_t v536 = vld1q_f32((const float32_t *)v417); + float32x4_t v540 = vld1q_f32((const float32_t *)v435); + float32x4_t v542 = vld1q_f32((const float32_t *)v444); + float32x4_t v544 = vld1q_f32((const float32_t *)v453); float32x4_t v35 = vaddq_f32(v530, v532); float32x4_t v36 = vsubq_f32(v530, v532); float32x4_t v51 = vaddq_f32(v534, v536); @@ -1499,8 +1499,8 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu8(const armral_cmplx_f32_t *restrict x, float32x4_t v143 = vsubq_f32(v86, v115); float32x4_t v146 = vaddq_f32(v128, v136); float32x4_t v147 = vsubq_f32(v128, v136); - *(int16x4_t *)v463 = v154; - *(int16x4_t *)v499 = v186; + vst1_s16((int16_t *)v463, v154); + vst1_s16((int16_t *)v499, v186); float32x4_t v148 = vaddq_f32(v144, v146); float32x4_t v149 = vsubq_f32(v144, v146); float32x4_t v150 = vaddq_f32(v145, v147); @@ -1511,12 +1511,12 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu8(const armral_cmplx_f32_t *restrict x, int16x4_t v178 = vqmovn_s32(vcvtq_n_s32_f32(v150, 15)); int16x4_t v194 = vqmovn_s32(vcvtq_n_s32_f32(v151, 15)); int16x4_t v210 = vqmovn_s32(vcvtq_n_s32_f32(v148, 15)); - *(int16x4_t *)v481 = v170; - *(int16x4_t *)v517 = v202; - *(int16x4_t *)v472 = v162; - *(int16x4_t *)v490 = v178; - *(int16x4_t *)v508 = v194; - *(int16x4_t *)v526 = v210; + vst1_s16((int16_t 
*)v481, v170); + vst1_s16((int16_t *)v517, v202); + vst1_s16((int16_t *)v472, v162); + vst1_s16((int16_t *)v490, v178); + vst1_s16((int16_t *)v508, v194); + vst1_s16((int16_t *)v526, v210); v5 += 2 * 1; v6 += 2 * 1; } @@ -1821,7 +1821,7 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu9(const armral_cmplx_f32_t *restrict x, float32x2_t v168 = (float32x2_t){v166, v167}; const float32x2_t *v565 = &v5[0]; int32_t *v575 = &v6[0]; - float32x4_t v651 = *(const float32x4_t *)v492; + float32x4_t v651 = vld1q_f32((const float32_t *)v492); float32x4_t v112 = vcombine_f32(v111, v111); float32x4_t v125 = vcombine_f32(v124, v124); float32x2_t v131 = vmul_f32(v169, v129); @@ -1845,18 +1845,18 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu9(const armral_cmplx_f32_t *restrict x, int32_t *v629 = &v6[ostride * 6]; int32_t *v638 = &v6[ostride * 7]; int32_t *v647 = &v6[ostride * 8]; - float32x4_t v667 = *(const float32x4_t *)v565; + float32x4_t v667 = vld1q_f32((const float32_t *)v565); float32x4_t v133 = vcombine_f32(v131, v131); float32x4_t v156 = vcombine_f32(v154, v154); float32x4_t v164 = vcombine_f32(v162, v162); float32x4_t v172 = vcombine_f32(v170, v170); - float32x4_t v653 = *(const float32x4_t *)v501; - float32x4_t v655 = *(const float32x4_t *)v510; - float32x4_t v657 = *(const float32x4_t *)v519; - float32x4_t v659 = *(const float32x4_t *)v528; - float32x4_t v661 = *(const float32x4_t *)v537; - float32x4_t v663 = *(const float32x4_t *)v546; - float32x4_t v665 = *(const float32x4_t *)v555; + float32x4_t v653 = vld1q_f32((const float32_t *)v501); + float32x4_t v655 = vld1q_f32((const float32_t *)v510); + float32x4_t v657 = vld1q_f32((const float32_t *)v519); + float32x4_t v659 = vld1q_f32((const float32_t *)v528); + float32x4_t v661 = vld1q_f32((const float32_t *)v537); + float32x4_t v663 = vld1q_f32((const float32_t *)v546); + float32x4_t v665 = vld1q_f32((const float32_t *)v555); float32x4_t v35 = vaddq_f32(v651, v653); float32x4_t v36 = vsubq_f32(v651, v653); float32x4_t v51 = 
vaddq_f32(v655, v657); @@ -1904,7 +1904,7 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu9(const armral_cmplx_f32_t *restrict x, int16x4_t v201 = vqmovn_s32(vcvtq_n_s32_f32(v95, 15)); float32x4_t v176 = vaddq_f32(v95, v175); float32x4_t v180 = vaddq_f32(v179, v174); - *(int16x4_t *)v575 = v201; + vst1_s16((int16_t *)v575, v201); float32x4_t v177 = vaddq_f32(v176, v121); float32x4_t v178 = vsubq_f32(v176, v121); float32x4_t v181 = vaddq_f32(v180, v139); @@ -1921,20 +1921,20 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu9(const armral_cmplx_f32_t *restrict x, float32x4_t v196 = vsubq_f32(v184, v190); float32x4_t v197 = vaddq_f32(v186, v192); float32x4_t v198 = vsubq_f32(v186, v192); - *(int16x4_t *)v602 = v225; - *(int16x4_t *)v629 = v249; + vst1_s16((int16_t *)v602, v225); + vst1_s16((int16_t *)v629, v249); int16x4_t v209 = vqmovn_s32(vcvtq_n_s32_f32(v194, 15)); int16x4_t v217 = vqmovn_s32(vcvtq_n_s32_f32(v195, 15)); int16x4_t v233 = vqmovn_s32(vcvtq_n_s32_f32(v198, 15)); int16x4_t v241 = vqmovn_s32(vcvtq_n_s32_f32(v197, 15)); int16x4_t v257 = vqmovn_s32(vcvtq_n_s32_f32(v196, 15)); int16x4_t v265 = vqmovn_s32(vcvtq_n_s32_f32(v193, 15)); - *(int16x4_t *)v584 = v209; - *(int16x4_t *)v593 = v217; - *(int16x4_t *)v611 = v233; - *(int16x4_t *)v620 = v241; - *(int16x4_t *)v638 = v257; - *(int16x4_t *)v647 = v265; + vst1_s16((int16_t *)v584, v209); + vst1_s16((int16_t *)v593, v217); + vst1_s16((int16_t *)v611, v233); + vst1_s16((int16_t *)v620, v241); + vst1_s16((int16_t *)v638, v257); + vst1_s16((int16_t *)v647, v265); v5 += 2 * 1; v6 += 2 * 1; } @@ -2344,7 +2344,7 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu10(const armral_cmplx_f32_t *restrict x, float32x2_t v198 = (float32x2_t){v196, v197}; const float32x2_t *v532 = &v5[0]; int32_t *v623 = &v6[0]; - float32x4_t v722 = *(const float32x4_t *)v595; + float32x4_t v722 = vld1q_f32((const float32_t *)v595); float32x4_t v173 = vcombine_f32(v172, v172); float32x4_t v178 = vcombine_f32(v177, v177); float32x2_t v184 = vmul_f32(v199, v182); @@ 
-2366,18 +2366,18 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu10(const armral_cmplx_f32_t *restrict x, int32_t *v686 = &v6[ostride * 3]; int32_t *v695 = &v6[ostride * 4]; int32_t *v704 = &v6[ostride * 9]; - float32x4_t v708 = *(const float32x4_t *)v532; + float32x4_t v708 = vld1q_f32((const float32_t *)v532); float32x4_t v186 = vcombine_f32(v184, v184); float32x4_t v194 = vcombine_f32(v192, v192); float32x4_t v202 = vcombine_f32(v200, v200); - float32x4_t v710 = *(const float32x4_t *)v541; - float32x4_t v712 = *(const float32x4_t *)v550; - float32x4_t v714 = *(const float32x4_t *)v559; - float32x4_t v716 = *(const float32x4_t *)v568; - float32x4_t v718 = *(const float32x4_t *)v577; - float32x4_t v720 = *(const float32x4_t *)v586; - float32x4_t v724 = *(const float32x4_t *)v604; - float32x4_t v726 = *(const float32x4_t *)v613; + float32x4_t v710 = vld1q_f32((const float32_t *)v541); + float32x4_t v712 = vld1q_f32((const float32_t *)v550); + float32x4_t v714 = vld1q_f32((const float32_t *)v559); + float32x4_t v716 = vld1q_f32((const float32_t *)v568); + float32x4_t v718 = vld1q_f32((const float32_t *)v577); + float32x4_t v720 = vld1q_f32((const float32_t *)v586); + float32x4_t v724 = vld1q_f32((const float32_t *)v604); + float32x4_t v726 = vld1q_f32((const float32_t *)v613); float32x4_t v35 = vaddq_f32(v708, v710); float32x4_t v36 = vsubq_f32(v708, v710); float32x4_t v51 = vaddq_f32(v712, v714); @@ -2432,8 +2432,8 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu10(const armral_cmplx_f32_t *restrict x, float32x4_t v206 = vsubq_f32(v204, v179); float32x4_t v207 = vsubq_f32(v187, v195); float32x4_t v208 = vaddq_f32(v195, v203); - *(int16x4_t *)v623 = v215; - *(int16x4_t *)v632 = v223; + vst1_s16((int16_t *)v623, v215); + vst1_s16((int16_t *)v632, v223); float32x4_t v153 = vaddq_f32(v149, v151); float32x4_t v154 = vsubq_f32(v149, v151); float32x4_t v155 = vaddq_f32(v150, v152); @@ -2450,14 +2450,14 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu10(const armral_cmplx_f32_t *restrict x, 
int16x4_t v271 = vqmovn_s32(vcvtq_n_s32_f32(v211, 15)); int16x4_t v279 = vqmovn_s32(vcvtq_n_s32_f32(v153, 15)); int16x4_t v287 = vqmovn_s32(vcvtq_n_s32_f32(v209, 15)); - *(int16x4_t *)v641 = v231; - *(int16x4_t *)v650 = v239; - *(int16x4_t *)v659 = v247; - *(int16x4_t *)v668 = v255; - *(int16x4_t *)v677 = v263; - *(int16x4_t *)v686 = v271; - *(int16x4_t *)v695 = v279; - *(int16x4_t *)v704 = v287; + vst1_s16((int16_t *)v641, v231); + vst1_s16((int16_t *)v650, v239); + vst1_s16((int16_t *)v659, v247); + vst1_s16((int16_t *)v668, v255); + vst1_s16((int16_t *)v677, v263); + vst1_s16((int16_t *)v686, v271); + vst1_s16((int16_t *)v695, v279); + vst1_s16((int16_t *)v704, v287); v5 += 2 * 1; v6 += 2 * 1; } @@ -2894,7 +2894,7 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu11(const armral_cmplx_f32_t *restrict x, float32x2_t v264 = (float32x2_t){v262, v263}; const float32x2_t *v833 = &v5[0]; int32_t *v843 = &v6[0]; - float32x4_t v937 = *(const float32x4_t *)v742; + float32x4_t v937 = vld1q_f32((const float32_t *)v742); float32x4_t v143 = vcombine_f32(v142, v142); float32x2_t v149 = vmul_f32(v265, v147); float32x4_t v156 = vcombine_f32(v155, v155); @@ -2933,7 +2933,7 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu11(const armral_cmplx_f32_t *restrict x, int32_t *v906 = &v6[ostride * 4]; int32_t *v915 = &v6[ostride * 3]; int32_t *v924 = &v6[ostride * 2]; - float32x4_t v957 = *(const float32x4_t *)v833; + float32x4_t v957 = vld1q_f32((const float32_t *)v833); float32x4_t v151 = vcombine_f32(v149, v149); float32x4_t v204 = vcombine_f32(v202, v202); float32x4_t v212 = vcombine_f32(v210, v210); @@ -2944,15 +2944,15 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu11(const armral_cmplx_f32_t *restrict x, float32x4_t v252 = vcombine_f32(v250, v250); float32x4_t v260 = vcombine_f32(v258, v258); float32x4_t v268 = vcombine_f32(v266, v266); - float32x4_t v939 = *(const float32x4_t *)v751; - float32x4_t v941 = *(const float32x4_t *)v760; - float32x4_t v943 = *(const float32x4_t *)v769; - float32x4_t v945 = 
*(const float32x4_t *)v778; - float32x4_t v947 = *(const float32x4_t *)v787; - float32x4_t v949 = *(const float32x4_t *)v796; - float32x4_t v951 = *(const float32x4_t *)v805; - float32x4_t v953 = *(const float32x4_t *)v814; - float32x4_t v955 = *(const float32x4_t *)v823; + float32x4_t v939 = vld1q_f32((const float32_t *)v751); + float32x4_t v941 = vld1q_f32((const float32_t *)v760); + float32x4_t v943 = vld1q_f32((const float32_t *)v769); + float32x4_t v945 = vld1q_f32((const float32_t *)v778); + float32x4_t v947 = vld1q_f32((const float32_t *)v787); + float32x4_t v949 = vld1q_f32((const float32_t *)v796); + float32x4_t v951 = vld1q_f32((const float32_t *)v805); + float32x4_t v953 = vld1q_f32((const float32_t *)v814); + float32x4_t v955 = vld1q_f32((const float32_t *)v823); float32x4_t v35 = vaddq_f32(v937, v939); float32x4_t v50 = vaddq_f32(v941, v943); float32x4_t v65 = vaddq_f32(v945, v947); @@ -3048,7 +3048,7 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu11(const armral_cmplx_f32_t *restrict x, float32x4_t v299 = vaddq_f32(v152, v284); float32x4_t v301 = vsubq_f32(v284, v280); float32x4_t v304 = vaddq_f32(v303, v281); - *(int16x4_t *)v843 = v319; + vst1_s16((int16_t *)v843, v319); float32x4_t v288 = vsubq_f32(v287, v277); float32x4_t v290 = vaddq_f32(v289, v278); float32x4_t v292 = vsubq_f32(v291, v278); @@ -3079,16 +3079,16 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu11(const armral_cmplx_f32_t *restrict x, int16x4_t v391 = vqmovn_s32(vcvtq_n_s32_f32(v315, 15)); int16x4_t v327 = vqmovn_s32(vcvtq_n_s32_f32(v307, 15)); int16x4_t v399 = vqmovn_s32(vcvtq_n_s32_f32(v316, 15)); - *(int16x4_t *)v861 = v335; - *(int16x4_t *)v870 = v343; - *(int16x4_t *)v879 = v351; - *(int16x4_t *)v888 = v359; - *(int16x4_t *)v897 = v367; - *(int16x4_t *)v906 = v375; - *(int16x4_t *)v915 = v383; - *(int16x4_t *)v924 = v391; - *(int16x4_t *)v852 = v327; - *(int16x4_t *)v933 = v399; + vst1_s16((int16_t *)v861, v335); + vst1_s16((int16_t *)v870, v343); + vst1_s16((int16_t *)v879, v351); + 
vst1_s16((int16_t *)v888, v359); + vst1_s16((int16_t *)v897, v367); + vst1_s16((int16_t *)v906, v375); + vst1_s16((int16_t *)v915, v383); + vst1_s16((int16_t *)v924, v391); + vst1_s16((int16_t *)v852, v327); + vst1_s16((int16_t *)v933, v399); v5 += 2 * 1; v6 += 2 * 1; } @@ -3732,7 +3732,7 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu12(const armral_cmplx_f32_t *restrict x, float32x2_t v211 = (float32x2_t){v210, v210}; const float32x2_t *v604 = &v5[0]; int32_t *v695 = &v6[0]; - float32x4_t v816 = *(const float32x4_t *)v667; + float32x4_t v816 = vld1q_f32((const float32_t *)v667); float32x2_t v142 = vmul_f32(v204, v140); float32x4_t v167 = vcombine_f32(v166, v166); float32x2_t v173 = vmul_f32(v204, v171); @@ -3758,20 +3758,20 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu12(const armral_cmplx_f32_t *restrict x, int32_t *v776 = &v6[ostride * 3]; int32_t *v785 = &v6[ostride * 7]; int32_t *v794 = &v6[ostride * 11]; - float32x4_t v802 = *(const float32x4_t *)v604; + float32x4_t v802 = vld1q_f32((const float32_t *)v604); float32x4_t v144 = vcombine_f32(v142, v142); float32x4_t v175 = vcombine_f32(v173, v173); float32x4_t v207 = vcombine_f32(v205, v205); - float32x4_t v798 = *(const float32x4_t *)v585; - float32x4_t v800 = *(const float32x4_t *)v594; - float32x4_t v804 = *(const float32x4_t *)v613; - float32x4_t v806 = *(const float32x4_t *)v622; - float32x4_t v808 = *(const float32x4_t *)v631; - float32x4_t v810 = *(const float32x4_t *)v640; - float32x4_t v812 = *(const float32x4_t *)v649; - float32x4_t v814 = *(const float32x4_t *)v658; - float32x4_t v818 = *(const float32x4_t *)v676; - float32x4_t v820 = *(const float32x4_t *)v685; + float32x4_t v798 = vld1q_f32((const float32_t *)v585); + float32x4_t v800 = vld1q_f32((const float32_t *)v594); + float32x4_t v804 = vld1q_f32((const float32_t *)v613); + float32x4_t v806 = vld1q_f32((const float32_t *)v622); + float32x4_t v808 = vld1q_f32((const float32_t *)v631); + float32x4_t v810 = vld1q_f32((const float32_t *)v640); + 
float32x4_t v812 = vld1q_f32((const float32_t *)v649); + float32x4_t v814 = vld1q_f32((const float32_t *)v658); + float32x4_t v818 = vld1q_f32((const float32_t *)v676); + float32x4_t v820 = vld1q_f32((const float32_t *)v685); float32x4_t v35 = vaddq_f32(v798, v800); float32x4_t v36 = vsubq_f32(v798, v800); float32x4_t v59 = vaddq_f32(v804, v806); @@ -3830,8 +3830,8 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu12(const armral_cmplx_f32_t *restrict x, float32x4_t v218 = vsubq_f32(v216, v192); float32x4_t v271 = vaddq_f32(v270, v200); float32x4_t v272 = vsubq_f32(v270, v200); - *(int16x4_t *)v695 = v221; - *(int16x4_t *)v749 = v275; + vst1_s16((int16_t *)v695, v221); + vst1_s16((int16_t *)v749, v275); int16x4_t v229 = vqmovn_s32(vcvtq_n_s32_f32(v218, 15)); int16x4_t v237 = vqmovn_s32(vcvtq_n_s32_f32(v217, 15)); float32x4_t v243 = vaddq_f32(v147, v178); @@ -3844,20 +3844,20 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu12(const armral_cmplx_f32_t *restrict x, float32x4_t v245 = vsubq_f32(v243, v215); float32x4_t v298 = vaddq_f32(v297, v214); float32x4_t v299 = vsubq_f32(v297, v214); - *(int16x4_t *)v704 = v229; - *(int16x4_t *)v713 = v237; - *(int16x4_t *)v722 = v248; - *(int16x4_t *)v758 = v283; - *(int16x4_t *)v767 = v291; - *(int16x4_t *)v776 = v302; + vst1_s16((int16_t *)v704, v229); + vst1_s16((int16_t *)v713, v237); + vst1_s16((int16_t *)v722, v248); + vst1_s16((int16_t *)v758, v283); + vst1_s16((int16_t *)v767, v291); + vst1_s16((int16_t *)v776, v302); int16x4_t v256 = vqmovn_s32(vcvtq_n_s32_f32(v245, 15)); int16x4_t v264 = vqmovn_s32(vcvtq_n_s32_f32(v244, 15)); int16x4_t v310 = vqmovn_s32(vcvtq_n_s32_f32(v299, 15)); int16x4_t v318 = vqmovn_s32(vcvtq_n_s32_f32(v298, 15)); - *(int16x4_t *)v731 = v256; - *(int16x4_t *)v740 = v264; - *(int16x4_t *)v785 = v310; - *(int16x4_t *)v794 = v318; + vst1_s16((int16_t *)v731, v256); + vst1_s16((int16_t *)v740, v264); + vst1_s16((int16_t *)v785, v310); + vst1_s16((int16_t *)v794, v318); v5 += 2 * 1; v6 += 2 * 1; } @@ -4340,7 +4340,7 
@@ void armral_fft_cf32_cf32_cs16_ac_n_uu13(const armral_cmplx_f32_t *restrict x, float32x2_t v290 = (float32x2_t){v288, v289}; const float32x2_t *v935 = &v5[0]; int32_t *v945 = &v6[0]; - float32x4_t v1057 = *(const float32x4_t *)v826; + float32x4_t v1057 = vld1q_f32((const float32_t *)v826); float32x4_t v163 = vcombine_f32(v162, v162); float32x4_t v168 = vcombine_f32(v167, v167); float32x2_t v174 = vmul_f32(v291, v172); @@ -4383,7 +4383,7 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu13(const armral_cmplx_f32_t *restrict x, int32_t *v1026 = &v6[ostride * 4]; int32_t *v1035 = &v6[ostride * 3]; int32_t *v1044 = &v6[ostride * 2]; - float32x4_t v1081 = *(const float32x4_t *)v935; + float32x4_t v1081 = vld1q_f32((const float32_t *)v935); float32x4_t v176 = vcombine_f32(v174, v174); float32x4_t v184 = vcombine_f32(v182, v182); float32x4_t v192 = vcombine_f32(v190, v190); @@ -4396,17 +4396,17 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu13(const armral_cmplx_f32_t *restrict x, float32x4_t v278 = vcombine_f32(v276, v276); float32x4_t v286 = vcombine_f32(v284, v284); float32x4_t v294 = vcombine_f32(v292, v292); - float32x4_t v1059 = *(const float32x4_t *)v835; - float32x4_t v1061 = *(const float32x4_t *)v844; - float32x4_t v1063 = *(const float32x4_t *)v853; - float32x4_t v1065 = *(const float32x4_t *)v862; - float32x4_t v1067 = *(const float32x4_t *)v871; - float32x4_t v1069 = *(const float32x4_t *)v880; - float32x4_t v1071 = *(const float32x4_t *)v889; - float32x4_t v1073 = *(const float32x4_t *)v898; - float32x4_t v1075 = *(const float32x4_t *)v907; - float32x4_t v1077 = *(const float32x4_t *)v916; - float32x4_t v1079 = *(const float32x4_t *)v925; + float32x4_t v1059 = vld1q_f32((const float32_t *)v835); + float32x4_t v1061 = vld1q_f32((const float32_t *)v844); + float32x4_t v1063 = vld1q_f32((const float32_t *)v853); + float32x4_t v1065 = vld1q_f32((const float32_t *)v862); + float32x4_t v1067 = vld1q_f32((const float32_t *)v871); + float32x4_t v1069 = vld1q_f32((const 
float32_t *)v880); + float32x4_t v1071 = vld1q_f32((const float32_t *)v889); + float32x4_t v1073 = vld1q_f32((const float32_t *)v898); + float32x4_t v1075 = vld1q_f32((const float32_t *)v907); + float32x4_t v1077 = vld1q_f32((const float32_t *)v916); + float32x4_t v1079 = vld1q_f32((const float32_t *)v925); float32x4_t v35 = vaddq_f32(v1057, v1059); float32x4_t v50 = vaddq_f32(v1061, v1063); float32x4_t v65 = vaddq_f32(v1065, v1067); @@ -4504,7 +4504,7 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu13(const armral_cmplx_f32_t *restrict x, float32x4_t v327 = vaddq_f32(v309, v313); float32x4_t v329 = vaddq_f32(v311, v313); float32x4_t v331 = vsubq_f32(v310, v314); - *(int16x4_t *)v945 = v349; + vst1_s16((int16_t *)v945, v349); float32x4_t v304 = vaddq_f32(v303, v218); float32x4_t v306 = vsubq_f32(v305, v223); float32x4_t v308 = vaddq_f32(v307, v223); @@ -4547,18 +4547,18 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu13(const armral_cmplx_f32_t *restrict x, int16x4_t v429 = vqmovn_s32(vcvtq_n_s32_f32(v344, 15)); int16x4_t v437 = vqmovn_s32(vcvtq_n_s32_f32(v345, 15)); int16x4_t v445 = vqmovn_s32(vcvtq_n_s32_f32(v346, 15)); - *(int16x4_t *)v954 = v357; - *(int16x4_t *)v963 = v365; - *(int16x4_t *)v972 = v373; - *(int16x4_t *)v981 = v381; - *(int16x4_t *)v990 = v389; - *(int16x4_t *)v999 = v397; - *(int16x4_t *)v1008 = v405; - *(int16x4_t *)v1017 = v413; - *(int16x4_t *)v1026 = v421; - *(int16x4_t *)v1035 = v429; - *(int16x4_t *)v1044 = v437; - *(int16x4_t *)v1053 = v445; + vst1_s16((int16_t *)v954, v357); + vst1_s16((int16_t *)v963, v365); + vst1_s16((int16_t *)v972, v373); + vst1_s16((int16_t *)v981, v381); + vst1_s16((int16_t *)v990, v389); + vst1_s16((int16_t *)v999, v397); + vst1_s16((int16_t *)v1008, v405); + vst1_s16((int16_t *)v1017, v413); + vst1_s16((int16_t *)v1026, v421); + vst1_s16((int16_t *)v1035, v429); + vst1_s16((int16_t *)v1044, v437); + vst1_s16((int16_t *)v1053, v445); v5 += 2 * 1; v6 += 2 * 1; } @@ -5285,7 +5285,7 @@ void 
armral_fft_cf32_cf32_cs16_ac_n_uu14(const armral_cmplx_f32_t *restrict x, float32x2_t v294 = (float32x2_t){v292, v293}; const float32x2_t *v786 = &v5[0]; int32_t *v913 = &v6[0]; - float32x4_t v1052 = *(const float32x4_t *)v867; + float32x4_t v1052 = vld1q_f32((const float32_t *)v867); float32x4_t v251 = vcombine_f32(v250, v250); float32x4_t v256 = vcombine_f32(v255, v255); float32x4_t v261 = vcombine_f32(v260, v260); @@ -5318,23 +5318,23 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu14(const armral_cmplx_f32_t *restrict x, int32_t *v1012 = &v6[ostride * 5]; int32_t *v1021 = &v6[ostride * 6]; int32_t *v1030 = &v6[ostride * 13]; - float32x4_t v1034 = *(const float32x4_t *)v786; + float32x4_t v1034 = vld1q_f32((const float32_t *)v786); float32x4_t v274 = vcombine_f32(v272, v272); float32x4_t v282 = vcombine_f32(v280, v280); float32x4_t v290 = vcombine_f32(v288, v288); float32x4_t v298 = vcombine_f32(v296, v296); - float32x4_t v1036 = *(const float32x4_t *)v795; - float32x4_t v1038 = *(const float32x4_t *)v804; - float32x4_t v1040 = *(const float32x4_t *)v813; - float32x4_t v1042 = *(const float32x4_t *)v822; - float32x4_t v1044 = *(const float32x4_t *)v831; - float32x4_t v1046 = *(const float32x4_t *)v840; - float32x4_t v1048 = *(const float32x4_t *)v849; - float32x4_t v1050 = *(const float32x4_t *)v858; - float32x4_t v1054 = *(const float32x4_t *)v876; - float32x4_t v1056 = *(const float32x4_t *)v885; - float32x4_t v1058 = *(const float32x4_t *)v894; - float32x4_t v1060 = *(const float32x4_t *)v903; + float32x4_t v1036 = vld1q_f32((const float32_t *)v795); + float32x4_t v1038 = vld1q_f32((const float32_t *)v804); + float32x4_t v1040 = vld1q_f32((const float32_t *)v813); + float32x4_t v1042 = vld1q_f32((const float32_t *)v822); + float32x4_t v1044 = vld1q_f32((const float32_t *)v831); + float32x4_t v1046 = vld1q_f32((const float32_t *)v840); + float32x4_t v1048 = vld1q_f32((const float32_t *)v849); + float32x4_t v1050 = vld1q_f32((const float32_t *)v858); + float32x4_t 
v1054 = vld1q_f32((const float32_t *)v876); + float32x4_t v1056 = vld1q_f32((const float32_t *)v885); + float32x4_t v1058 = vld1q_f32((const float32_t *)v894); + float32x4_t v1060 = vld1q_f32((const float32_t *)v903); float32x4_t v35 = vaddq_f32(v1034, v1036); float32x4_t v36 = vsubq_f32(v1034, v1036); float32x4_t v51 = vaddq_f32(v1038, v1040); @@ -5423,8 +5423,8 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu14(const armral_cmplx_f32_t *restrict x, float32x4_t v307 = vaddq_f32(v275, v283); float32x4_t v309 = vsubq_f32(v275, v283); float32x4_t v311 = vsubq_f32(v275, v291); - *(int16x4_t *)v913 = v321; - *(int16x4_t *)v922 = v329; + vst1_s16((int16_t *)v913, v321); + vst1_s16((int16_t *)v922, v329); float32x4_t v209 = vaddq_f32(v208, v169); float32x4_t v211 = vsubq_f32(v210, v174); float32x4_t v213 = vaddq_f32(v212, v174); @@ -5461,18 +5461,18 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu14(const armral_cmplx_f32_t *restrict x, int16x4_t v409 = vqmovn_s32(vcvtq_n_s32_f32(v315, 15)); int16x4_t v417 = vqmovn_s32(vcvtq_n_s32_f32(v220, 15)); int16x4_t v425 = vqmovn_s32(vcvtq_n_s32_f32(v313, 15)); - *(int16x4_t *)v931 = v337; - *(int16x4_t *)v940 = v345; - *(int16x4_t *)v949 = v353; - *(int16x4_t *)v958 = v361; - *(int16x4_t *)v967 = v369; - *(int16x4_t *)v976 = v377; - *(int16x4_t *)v985 = v385; - *(int16x4_t *)v994 = v393; - *(int16x4_t *)v1003 = v401; - *(int16x4_t *)v1012 = v409; - *(int16x4_t *)v1021 = v417; - *(int16x4_t *)v1030 = v425; + vst1_s16((int16_t *)v931, v337); + vst1_s16((int16_t *)v940, v345); + vst1_s16((int16_t *)v949, v353); + vst1_s16((int16_t *)v958, v361); + vst1_s16((int16_t *)v967, v369); + vst1_s16((int16_t *)v976, v377); + vst1_s16((int16_t *)v985, v385); + vst1_s16((int16_t *)v994, v393); + vst1_s16((int16_t *)v1003, v401); + vst1_s16((int16_t *)v1012, v409); + vst1_s16((int16_t *)v1021, v417); + vst1_s16((int16_t *)v1030, v425); v5 += 2 * 1; v6 += 2 * 1; } @@ -6121,7 +6121,7 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu15(const armral_cmplx_f32_t 
*restrict x, float32x2_t v297 = (float32x2_t){v296, v296}; const float32x2_t *v826 = &v5[0]; int32_t *v944 = &v6[0]; - float32x4_t v1088 = *(const float32x4_t *)v871; + float32x4_t v1088 = vld1q_f32((const float32_t *)v871); float32x4_t v157 = vcombine_f32(v156, v156); float32x4_t v162 = vcombine_f32(v161, v161); float32x2_t v168 = vmul_f32(v280, v166); @@ -6165,7 +6165,7 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu15(const armral_cmplx_f32_t *restrict x, int32_t *v1052 = &v6[ostride * 9]; int32_t *v1061 = &v6[ostride * 4]; int32_t *v1070 = &v6[ostride * 14]; - float32x4_t v1078 = *(const float32x4_t *)v826; + float32x4_t v1078 = vld1q_f32((const float32_t *)v826); float32x4_t v170 = vcombine_f32(v168, v168); float32x4_t v178 = vcombine_f32(v176, v176); float32x4_t v186 = vcombine_f32(v184, v184); @@ -6175,19 +6175,19 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu15(const armral_cmplx_f32_t *restrict x, float32x4_t v267 = vcombine_f32(v265, v265); float32x4_t v275 = vcombine_f32(v273, v273); float32x4_t v283 = vcombine_f32(v281, v281); - float32x4_t v1074 = *(const float32x4_t *)v807; - float32x4_t v1076 = *(const float32x4_t *)v816; - float32x4_t v1080 = *(const float32x4_t *)v835; - float32x4_t v1082 = *(const float32x4_t *)v844; - float32x4_t v1084 = *(const float32x4_t *)v853; - float32x4_t v1086 = *(const float32x4_t *)v862; - float32x4_t v1090 = *(const float32x4_t *)v880; - float32x4_t v1092 = *(const float32x4_t *)v889; - float32x4_t v1094 = *(const float32x4_t *)v898; - float32x4_t v1096 = *(const float32x4_t *)v907; - float32x4_t v1098 = *(const float32x4_t *)v916; - float32x4_t v1100 = *(const float32x4_t *)v925; - float32x4_t v1102 = *(const float32x4_t *)v934; + float32x4_t v1074 = vld1q_f32((const float32_t *)v807); + float32x4_t v1076 = vld1q_f32((const float32_t *)v816); + float32x4_t v1080 = vld1q_f32((const float32_t *)v835); + float32x4_t v1082 = vld1q_f32((const float32_t *)v844); + float32x4_t v1084 = vld1q_f32((const float32_t *)v853); + float32x4_t 
v1086 = vld1q_f32((const float32_t *)v862); + float32x4_t v1090 = vld1q_f32((const float32_t *)v880); + float32x4_t v1092 = vld1q_f32((const float32_t *)v889); + float32x4_t v1094 = vld1q_f32((const float32_t *)v898); + float32x4_t v1096 = vld1q_f32((const float32_t *)v907); + float32x4_t v1098 = vld1q_f32((const float32_t *)v916); + float32x4_t v1100 = vld1q_f32((const float32_t *)v925); + float32x4_t v1102 = vld1q_f32((const float32_t *)v934); float32x4_t v35 = vaddq_f32(v1074, v1076); float32x4_t v36 = vsubq_f32(v1074, v1076); float32x4_t v59 = vaddq_f32(v1080, v1082); @@ -6270,7 +6270,7 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu15(const armral_cmplx_f32_t *restrict x, float32x4_t v300 = vaddq_f32(v268, v276); float32x4_t v310 = vaddq_f32(v309, v268); float32x4_t v311 = vsubq_f32(v309, v268); - *(int16x4_t *)v944 = v314; + vst1_s16((int16_t *)v944, v314); float32x4_t v193 = vaddq_f32(v189, v191); float32x4_t v194 = vsubq_f32(v189, v191); float32x4_t v195 = vaddq_f32(v190, v192); @@ -6295,8 +6295,8 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu15(const armral_cmplx_f32_t *restrict x, int16x4_t v395 = vqmovn_s32(vcvtq_n_s32_f32(v195, 15)); float32x4_t v417 = vaddq_f32(v193, v249); int16x4_t v422 = vqmovn_s32(vcvtq_n_s32_f32(v193, 15)); - *(int16x4_t *)v953 = v322; - *(int16x4_t *)v962 = v330; + vst1_s16((int16_t *)v953, v322); + vst1_s16((int16_t *)v962, v330); float32x4_t v337 = vaddq_f32(v336, v306); float32x4_t v338 = vsubq_f32(v336, v306); float32x4_t v364 = vaddq_f32(v363, v308); @@ -6305,10 +6305,10 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu15(const armral_cmplx_f32_t *restrict x, float32x4_t v392 = vsubq_f32(v390, v307); float32x4_t v418 = vaddq_f32(v417, v305); float32x4_t v419 = vsubq_f32(v417, v305); - *(int16x4_t *)v971 = v341; - *(int16x4_t *)v998 = v368; - *(int16x4_t *)v1025 = v395; - *(int16x4_t *)v1052 = v422; + vst1_s16((int16_t *)v971, v341); + vst1_s16((int16_t *)v998, v368); + vst1_s16((int16_t *)v1025, v395); + vst1_s16((int16_t *)v1052, v422); 
int16x4_t v349 = vqmovn_s32(vcvtq_n_s32_f32(v338, 15)); int16x4_t v357 = vqmovn_s32(vcvtq_n_s32_f32(v337, 15)); int16x4_t v376 = vqmovn_s32(vcvtq_n_s32_f32(v365, 15)); @@ -6317,14 +6317,14 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu15(const armral_cmplx_f32_t *restrict x, int16x4_t v411 = vqmovn_s32(vcvtq_n_s32_f32(v391, 15)); int16x4_t v430 = vqmovn_s32(vcvtq_n_s32_f32(v419, 15)); int16x4_t v438 = vqmovn_s32(vcvtq_n_s32_f32(v418, 15)); - *(int16x4_t *)v980 = v349; - *(int16x4_t *)v989 = v357; - *(int16x4_t *)v1007 = v376; - *(int16x4_t *)v1016 = v384; - *(int16x4_t *)v1034 = v403; - *(int16x4_t *)v1043 = v411; - *(int16x4_t *)v1061 = v430; - *(int16x4_t *)v1070 = v438; + vst1_s16((int16_t *)v980, v349); + vst1_s16((int16_t *)v989, v357); + vst1_s16((int16_t *)v1007, v376); + vst1_s16((int16_t *)v1016, v384); + vst1_s16((int16_t *)v1034, v403); + vst1_s16((int16_t *)v1043, v411); + vst1_s16((int16_t *)v1061, v430); + vst1_s16((int16_t *)v1070, v438); v5 += 2 * 1; v6 += 2 * 1; } @@ -7006,7 +7006,7 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu16(const armral_cmplx_f32_t *restrict x, float32x2_t v284 = (float32x2_t){v283, v283}; const float32x2_t *v834 = &v5[0]; int32_t *v979 = &v6[0]; - float32x4_t v1134 = *(const float32x4_t *)v906; + float32x4_t v1134 = vld1q_f32((const float32_t *)v906); float32x2_t v231 = vmul_f32(v267, v229); float32x2_t v239 = vmul_f32(v267, v237); float32x4_t v246 = vcombine_f32(v245, v245); @@ -7044,26 +7044,26 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu16(const armral_cmplx_f32_t *restrict x, int32_t *v1096 = &v6[ostride * 13]; int32_t *v1105 = &v6[ostride * 14]; int32_t *v1114 = &v6[ostride * 15]; - float32x4_t v1118 = *(const float32x4_t *)v834; + float32x4_t v1118 = vld1q_f32((const float32_t *)v834); float32x4_t v233 = vcombine_f32(v231, v231); float32x4_t v241 = vcombine_f32(v239, v239); float32x4_t v254 = vcombine_f32(v252, v252); float32x4_t v262 = vcombine_f32(v260, v260); float32x4_t v270 = vcombine_f32(v268, v268); - float32x4_t v1120 = 
*(const float32x4_t *)v843; - float32x4_t v1122 = *(const float32x4_t *)v852; - float32x4_t v1124 = *(const float32x4_t *)v861; - float32x4_t v1126 = *(const float32x4_t *)v870; - float32x4_t v1128 = *(const float32x4_t *)v879; - float32x4_t v1130 = *(const float32x4_t *)v888; - float32x4_t v1132 = *(const float32x4_t *)v897; - float32x4_t v1136 = *(const float32x4_t *)v915; - float32x4_t v1138 = *(const float32x4_t *)v924; - float32x4_t v1140 = *(const float32x4_t *)v933; - float32x4_t v1142 = *(const float32x4_t *)v942; - float32x4_t v1144 = *(const float32x4_t *)v951; - float32x4_t v1146 = *(const float32x4_t *)v960; - float32x4_t v1148 = *(const float32x4_t *)v969; + float32x4_t v1120 = vld1q_f32((const float32_t *)v843); + float32x4_t v1122 = vld1q_f32((const float32_t *)v852); + float32x4_t v1124 = vld1q_f32((const float32_t *)v861); + float32x4_t v1126 = vld1q_f32((const float32_t *)v870); + float32x4_t v1128 = vld1q_f32((const float32_t *)v879); + float32x4_t v1130 = vld1q_f32((const float32_t *)v888); + float32x4_t v1132 = vld1q_f32((const float32_t *)v897); + float32x4_t v1136 = vld1q_f32((const float32_t *)v915); + float32x4_t v1138 = vld1q_f32((const float32_t *)v924); + float32x4_t v1140 = vld1q_f32((const float32_t *)v933); + float32x4_t v1142 = vld1q_f32((const float32_t *)v942); + float32x4_t v1144 = vld1q_f32((const float32_t *)v951); + float32x4_t v1146 = vld1q_f32((const float32_t *)v960); + float32x4_t v1148 = vld1q_f32((const float32_t *)v969); float32x4_t v35 = vaddq_f32(v1118, v1120); float32x4_t v36 = vsubq_f32(v1118, v1120); float32x4_t v51 = vaddq_f32(v1122, v1124); @@ -7151,8 +7151,8 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu16(const armral_cmplx_f32_t *restrict x, float32x4_t v312 = vsubq_f32(v298, v300); float32x4_t v313 = vaddq_f32(v298, v306); float32x4_t v314 = vsubq_f32(v298, v306); - *(int16x4_t *)v979 = v333; - *(int16x4_t *)v1051 = v397; + vst1_s16((int16_t *)v979, v333); + vst1_s16((int16_t *)v1051, v397); float32x4_t v293 = 
vaddq_f32(v289, v290); float32x4_t v294 = vaddq_f32(v291, v292); float32x4_t v295 = vsubq_f32(v291, v292); @@ -7177,8 +7177,8 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu16(const armral_cmplx_f32_t *restrict x, int16x4_t v381 = vqmovn_s32(vcvtq_n_s32_f32(v295, 15)); int16x4_t v413 = vqmovn_s32(vcvtq_n_s32_f32(v294, 15)); int16x4_t v445 = vqmovn_s32(vcvtq_n_s32_f32(v293, 15)); - *(int16x4_t *)v1015 = v365; - *(int16x4_t *)v1087 = v429; + vst1_s16((int16_t *)v1015, v365); + vst1_s16((int16_t *)v1087, v429); int16x4_t v341 = vqmovn_s32(vcvtq_n_s32_f32(v326, 15)); int16x4_t v357 = vqmovn_s32(vcvtq_n_s32_f32(v329, 15)); int16x4_t v373 = vqmovn_s32(vcvtq_n_s32_f32(v330, 15)); @@ -7187,18 +7187,18 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu16(const armral_cmplx_f32_t *restrict x, int16x4_t v421 = vqmovn_s32(vcvtq_n_s32_f32(v327, 15)); int16x4_t v437 = vqmovn_s32(vcvtq_n_s32_f32(v328, 15)); int16x4_t v453 = vqmovn_s32(vcvtq_n_s32_f32(v323, 15)); - *(int16x4_t *)v997 = v349; - *(int16x4_t *)v1033 = v381; - *(int16x4_t *)v1069 = v413; - *(int16x4_t *)v1105 = v445; - *(int16x4_t *)v988 = v341; - *(int16x4_t *)v1006 = v357; - *(int16x4_t *)v1024 = v373; - *(int16x4_t *)v1042 = v389; - *(int16x4_t *)v1060 = v405; - *(int16x4_t *)v1078 = v421; - *(int16x4_t *)v1096 = v437; - *(int16x4_t *)v1114 = v453; + vst1_s16((int16_t *)v997, v349); + vst1_s16((int16_t *)v1033, v381); + vst1_s16((int16_t *)v1069, v413); + vst1_s16((int16_t *)v1105, v445); + vst1_s16((int16_t *)v988, v341); + vst1_s16((int16_t *)v1006, v357); + vst1_s16((int16_t *)v1024, v373); + vst1_s16((int16_t *)v1042, v389); + vst1_s16((int16_t *)v1060, v405); + vst1_s16((int16_t *)v1078, v421); + vst1_s16((int16_t *)v1096, v437); + vst1_s16((int16_t *)v1114, v453); v5 += 2 * 1; v6 += 2 * 1; } @@ -7924,7 +7924,7 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu17(const armral_cmplx_f32_t *restrict x, float32x2_t v444 = (float32x2_t){v442, v443}; const float32x2_t *v1390 = &v5[0]; int32_t *v1400 = &v6[0]; - float32x4_t v1548 = *(const 
float32x4_t *)v1245; + float32x4_t v1548 = vld1q_f32((const float32_t *)v1245); float32x4_t v215 = vcombine_f32(v214, v214); float32x4_t v220 = vcombine_f32(v219, v219); float32x4_t v225 = vcombine_f32(v224, v224); @@ -7990,7 +7990,7 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu17(const armral_cmplx_f32_t *restrict x, int32_t *v1526 = &v6[ostride * 10]; int32_t *v1535 = &v6[ostride * 8]; int32_t *v1544 = &v6[ostride * 9]; - float32x4_t v1580 = *(const float32x4_t *)v1390; + float32x4_t v1580 = vld1q_f32((const float32_t *)v1390); float32x4_t v288 = vcombine_f32(v286, v286); float32x4_t v296 = vcombine_f32(v294, v294); float32x4_t v304 = vcombine_f32(v302, v302); @@ -8012,21 +8012,21 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu17(const armral_cmplx_f32_t *restrict x, float32x4_t v432 = vcombine_f32(v430, v430); float32x4_t v440 = vcombine_f32(v438, v438); float32x4_t v448 = vcombine_f32(v446, v446); - float32x4_t v1550 = *(const float32x4_t *)v1254; - float32x4_t v1552 = *(const float32x4_t *)v1263; - float32x4_t v1554 = *(const float32x4_t *)v1272; - float32x4_t v1556 = *(const float32x4_t *)v1281; - float32x4_t v1558 = *(const float32x4_t *)v1290; - float32x4_t v1560 = *(const float32x4_t *)v1299; - float32x4_t v1562 = *(const float32x4_t *)v1308; - float32x4_t v1564 = *(const float32x4_t *)v1317; - float32x4_t v1566 = *(const float32x4_t *)v1326; - float32x4_t v1568 = *(const float32x4_t *)v1335; - float32x4_t v1570 = *(const float32x4_t *)v1344; - float32x4_t v1572 = *(const float32x4_t *)v1353; - float32x4_t v1574 = *(const float32x4_t *)v1362; - float32x4_t v1576 = *(const float32x4_t *)v1371; - float32x4_t v1578 = *(const float32x4_t *)v1380; + float32x4_t v1550 = vld1q_f32((const float32_t *)v1254); + float32x4_t v1552 = vld1q_f32((const float32_t *)v1263); + float32x4_t v1554 = vld1q_f32((const float32_t *)v1272); + float32x4_t v1556 = vld1q_f32((const float32_t *)v1281); + float32x4_t v1558 = vld1q_f32((const float32_t *)v1290); + float32x4_t v1560 = 
vld1q_f32((const float32_t *)v1299); + float32x4_t v1562 = vld1q_f32((const float32_t *)v1308); + float32x4_t v1564 = vld1q_f32((const float32_t *)v1317); + float32x4_t v1566 = vld1q_f32((const float32_t *)v1326); + float32x4_t v1568 = vld1q_f32((const float32_t *)v1335); + float32x4_t v1570 = vld1q_f32((const float32_t *)v1344); + float32x4_t v1572 = vld1q_f32((const float32_t *)v1353); + float32x4_t v1574 = vld1q_f32((const float32_t *)v1362); + float32x4_t v1576 = vld1q_f32((const float32_t *)v1371); + float32x4_t v1578 = vld1q_f32((const float32_t *)v1380); float32x4_t v35 = vaddq_f32(v1548, v1550); float32x4_t v36 = vsubq_f32(v1548, v1550); float32x4_t v51 = vaddq_f32(v1552, v1554); @@ -8168,7 +8168,7 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu17(const armral_cmplx_f32_t *restrict x, float32x4_t v482 = vaddq_f32(v345, v353); float32x4_t v483 = vaddq_f32(v361, v377); float32x4_t v484 = vaddq_f32(v369, v377); - *(int16x4_t *)v1400 = v526; + vst1_s16((int16_t *)v1400, v526); float32x4_t v198 = vaddq_f32(v193, v197); float32x4_t v433 = vmulq_f32(v431, v432); float32x4_t v439 = vrev64q_f32(v197); @@ -8232,8 +8232,8 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu17(const armral_cmplx_f32_t *restrict x, float32x4_t v559 = vsubq_f32(v471, v510); float32x4_t v658 = vaddq_f32(v475, v521); float32x4_t v667 = vsubq_f32(v475, v521); - *(int16x4_t *)v1445 = v571; - *(int16x4_t *)v1454 = v580; + vst1_s16((int16_t *)v1445, v571); + vst1_s16((int16_t *)v1454, v580); float32x4_t v515 = vaddq_f32(v514, v496); float32x4_t v518 = vaddq_f32(v517, v503); float32x4_t v532 = vaddq_f32(v469, v506); @@ -8256,24 +8256,24 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu17(const armral_cmplx_f32_t *restrict x, int16x4_t v634 = vqmovn_s32(vcvtq_n_s32_f32(v631, 15)); float32x4_t v640 = vaddq_f32(v474, v518); float32x4_t v649 = vsubq_f32(v474, v518); - *(int16x4_t *)v1427 = v553; - *(int16x4_t *)v1436 = v562; - *(int16x4_t *)v1535 = v661; - *(int16x4_t *)v1544 = v670; + vst1_s16((int16_t *)v1427, v553); + 
vst1_s16((int16_t *)v1436, v562); + vst1_s16((int16_t *)v1535, v661); + vst1_s16((int16_t *)v1544, v670); int16x4_t v589 = vqmovn_s32(vcvtq_n_s32_f32(v586, 15)); int16x4_t v598 = vqmovn_s32(vcvtq_n_s32_f32(v595, 15)); int16x4_t v643 = vqmovn_s32(vcvtq_n_s32_f32(v640, 15)); int16x4_t v652 = vqmovn_s32(vcvtq_n_s32_f32(v649, 15)); - *(int16x4_t *)v1409 = v535; - *(int16x4_t *)v1418 = v544; - *(int16x4_t *)v1481 = v607; - *(int16x4_t *)v1490 = v616; - *(int16x4_t *)v1499 = v625; - *(int16x4_t *)v1508 = v634; - *(int16x4_t *)v1463 = v589; - *(int16x4_t *)v1472 = v598; - *(int16x4_t *)v1517 = v643; - *(int16x4_t *)v1526 = v652; + vst1_s16((int16_t *)v1409, v535); + vst1_s16((int16_t *)v1418, v544); + vst1_s16((int16_t *)v1481, v607); + vst1_s16((int16_t *)v1490, v616); + vst1_s16((int16_t *)v1499, v625); + vst1_s16((int16_t *)v1508, v634); + vst1_s16((int16_t *)v1463, v589); + vst1_s16((int16_t *)v1472, v598); + vst1_s16((int16_t *)v1517, v643); + vst1_s16((int16_t *)v1526, v652); v5 += 2 * 1; v6 += 2 * 1; } @@ -9351,7 +9351,7 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu18(const armral_cmplx_f32_t *restrict x, float32x2_t v364 = (float32x2_t){v362, v363}; const float32x2_t *v982 = &v5[0]; int32_t *v1145 = &v6[0]; - float32x4_t v1324 = *(const float32x4_t *)v1081; + float32x4_t v1324 = vld1q_f32((const float32_t *)v1081); float32x4_t v308 = vcombine_f32(v307, v307); float32x4_t v321 = vcombine_f32(v320, v320); float32x2_t v327 = vmul_f32(v365, v325); @@ -9393,27 +9393,27 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu18(const armral_cmplx_f32_t *restrict x, int32_t *v1280 = &v6[ostride * 7]; int32_t *v1289 = &v6[ostride * 8]; int32_t *v1298 = &v6[ostride * 17]; - float32x4_t v1302 = *(const float32x4_t *)v982; + float32x4_t v1302 = vld1q_f32((const float32_t *)v982); float32x4_t v329 = vcombine_f32(v327, v327); float32x4_t v352 = vcombine_f32(v350, v350); float32x4_t v360 = vcombine_f32(v358, v358); float32x4_t v368 = vcombine_f32(v366, v366); - float32x4_t v1304 = *(const 
float32x4_t *)v991; - float32x4_t v1306 = *(const float32x4_t *)v1000; - float32x4_t v1308 = *(const float32x4_t *)v1009; - float32x4_t v1310 = *(const float32x4_t *)v1018; - float32x4_t v1312 = *(const float32x4_t *)v1027; - float32x4_t v1314 = *(const float32x4_t *)v1036; - float32x4_t v1316 = *(const float32x4_t *)v1045; - float32x4_t v1318 = *(const float32x4_t *)v1054; - float32x4_t v1320 = *(const float32x4_t *)v1063; - float32x4_t v1322 = *(const float32x4_t *)v1072; - float32x4_t v1326 = *(const float32x4_t *)v1090; - float32x4_t v1328 = *(const float32x4_t *)v1099; - float32x4_t v1330 = *(const float32x4_t *)v1108; - float32x4_t v1332 = *(const float32x4_t *)v1117; - float32x4_t v1334 = *(const float32x4_t *)v1126; - float32x4_t v1336 = *(const float32x4_t *)v1135; + float32x4_t v1304 = vld1q_f32((const float32_t *)v991); + float32x4_t v1306 = vld1q_f32((const float32_t *)v1000); + float32x4_t v1308 = vld1q_f32((const float32_t *)v1009); + float32x4_t v1310 = vld1q_f32((const float32_t *)v1018); + float32x4_t v1312 = vld1q_f32((const float32_t *)v1027); + float32x4_t v1314 = vld1q_f32((const float32_t *)v1036); + float32x4_t v1316 = vld1q_f32((const float32_t *)v1045); + float32x4_t v1318 = vld1q_f32((const float32_t *)v1054); + float32x4_t v1320 = vld1q_f32((const float32_t *)v1063); + float32x4_t v1322 = vld1q_f32((const float32_t *)v1072); + float32x4_t v1326 = vld1q_f32((const float32_t *)v1090); + float32x4_t v1328 = vld1q_f32((const float32_t *)v1099); + float32x4_t v1330 = vld1q_f32((const float32_t *)v1108); + float32x4_t v1332 = vld1q_f32((const float32_t *)v1117); + float32x4_t v1334 = vld1q_f32((const float32_t *)v1126); + float32x4_t v1336 = vld1q_f32((const float32_t *)v1135); float32x4_t v35 = vaddq_f32(v1302, v1304); float32x4_t v36 = vsubq_f32(v1302, v1304); float32x4_t v51 = vaddq_f32(v1306, v1308); @@ -9526,8 +9526,8 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu18(const armral_cmplx_f32_t *restrict x, float32x4_t v261 = vaddq_f32(v260, v255); 
float32x4_t v372 = vaddq_f32(v291, v371); float32x4_t v376 = vaddq_f32(v375, v370); - *(int16x4_t *)v1145 = v397; - *(int16x4_t *)v1154 = v405; + vst1_s16((int16_t *)v1145, v397); + vst1_s16((int16_t *)v1154, v405); float32x4_t v258 = vaddq_f32(v257, v202); float32x4_t v259 = vsubq_f32(v257, v202); float32x4_t v262 = vaddq_f32(v261, v220); @@ -9560,10 +9560,10 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu18(const armral_cmplx_f32_t *restrict x, float32x4_t v392 = vsubq_f32(v380, v386); float32x4_t v393 = vaddq_f32(v382, v388); float32x4_t v394 = vsubq_f32(v382, v388); - *(int16x4_t *)v1199 = v445; - *(int16x4_t *)v1208 = v453; - *(int16x4_t *)v1253 = v493; - *(int16x4_t *)v1262 = v501; + vst1_s16((int16_t *)v1199, v445); + vst1_s16((int16_t *)v1208, v453); + vst1_s16((int16_t *)v1253, v493); + vst1_s16((int16_t *)v1262, v501); int16x4_t v413 = vqmovn_s32(vcvtq_n_s32_f32(v275, 15)); int16x4_t v421 = vqmovn_s32(vcvtq_n_s32_f32(v390, 15)); int16x4_t v429 = vqmovn_s32(vcvtq_n_s32_f32(v276, 15)); @@ -9576,18 +9576,18 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu18(const armral_cmplx_f32_t *restrict x, int16x4_t v517 = vqmovn_s32(vcvtq_n_s32_f32(v392, 15)); int16x4_t v525 = vqmovn_s32(vcvtq_n_s32_f32(v274, 15)); int16x4_t v533 = vqmovn_s32(vcvtq_n_s32_f32(v389, 15)); - *(int16x4_t *)v1163 = v413; - *(int16x4_t *)v1172 = v421; - *(int16x4_t *)v1181 = v429; - *(int16x4_t *)v1190 = v437; - *(int16x4_t *)v1217 = v461; - *(int16x4_t *)v1226 = v469; - *(int16x4_t *)v1235 = v477; - *(int16x4_t *)v1244 = v485; - *(int16x4_t *)v1271 = v509; - *(int16x4_t *)v1280 = v517; - *(int16x4_t *)v1289 = v525; - *(int16x4_t *)v1298 = v533; + vst1_s16((int16_t *)v1163, v413); + vst1_s16((int16_t *)v1172, v421); + vst1_s16((int16_t *)v1181, v429); + vst1_s16((int16_t *)v1190, v437); + vst1_s16((int16_t *)v1217, v461); + vst1_s16((int16_t *)v1226, v469); + vst1_s16((int16_t *)v1235, v477); + vst1_s16((int16_t *)v1244, v485); + vst1_s16((int16_t *)v1271, v509); + vst1_s16((int16_t *)v1280, v517); + 
vst1_s16((int16_t *)v1289, v525); + vst1_s16((int16_t *)v1298, v533); v5 += 2 * 1; v6 += 2 * 1; } @@ -10436,7 +10436,7 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu19(const armral_cmplx_f32_t *restrict x, float32x2_t v483 = (float32x2_t){v481, v482}; const float32x2_t *v1533 = &v5[0]; int32_t *v1543 = &v6[0]; - float32x4_t v1709 = *(const float32x4_t *)v1370; + float32x4_t v1709 = vld1q_f32((const float32_t *)v1370); float32x4_t v245 = vcombine_f32(v244, v244); float32x4_t v250 = vcombine_f32(v249, v249); float32x4_t v255 = vcombine_f32(v254, v254); @@ -10509,7 +10509,7 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu19(const armral_cmplx_f32_t *restrict x, int32_t *v1687 = &v6[ostride * 11]; int32_t *v1696 = &v6[ostride * 9]; int32_t *v1705 = &v6[ostride * 10]; - float32x4_t v1745 = *(const float32x4_t *)v1533; + float32x4_t v1745 = vld1q_f32((const float32_t *)v1533); float32x4_t v343 = vcombine_f32(v341, v341); float32x4_t v351 = vcombine_f32(v349, v349); float32x4_t v359 = vcombine_f32(v357, v357); @@ -10529,23 +10529,23 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu19(const armral_cmplx_f32_t *restrict x, float32x4_t v471 = vcombine_f32(v469, v469); float32x4_t v479 = vcombine_f32(v477, v477); float32x4_t v487 = vcombine_f32(v485, v485); - float32x4_t v1711 = *(const float32x4_t *)v1379; - float32x4_t v1713 = *(const float32x4_t *)v1388; - float32x4_t v1715 = *(const float32x4_t *)v1397; - float32x4_t v1717 = *(const float32x4_t *)v1406; - float32x4_t v1719 = *(const float32x4_t *)v1415; - float32x4_t v1721 = *(const float32x4_t *)v1424; - float32x4_t v1723 = *(const float32x4_t *)v1433; - float32x4_t v1725 = *(const float32x4_t *)v1442; - float32x4_t v1727 = *(const float32x4_t *)v1451; - float32x4_t v1729 = *(const float32x4_t *)v1460; - float32x4_t v1731 = *(const float32x4_t *)v1469; - float32x4_t v1733 = *(const float32x4_t *)v1478; - float32x4_t v1735 = *(const float32x4_t *)v1487; - float32x4_t v1737 = *(const float32x4_t *)v1496; - float32x4_t v1739 = *(const 
float32x4_t *)v1505; - float32x4_t v1741 = *(const float32x4_t *)v1514; - float32x4_t v1743 = *(const float32x4_t *)v1523; + float32x4_t v1711 = vld1q_f32((const float32_t *)v1379); + float32x4_t v1713 = vld1q_f32((const float32_t *)v1388); + float32x4_t v1715 = vld1q_f32((const float32_t *)v1397); + float32x4_t v1717 = vld1q_f32((const float32_t *)v1406); + float32x4_t v1719 = vld1q_f32((const float32_t *)v1415); + float32x4_t v1721 = vld1q_f32((const float32_t *)v1424); + float32x4_t v1723 = vld1q_f32((const float32_t *)v1433); + float32x4_t v1725 = vld1q_f32((const float32_t *)v1442); + float32x4_t v1727 = vld1q_f32((const float32_t *)v1451); + float32x4_t v1729 = vld1q_f32((const float32_t *)v1460); + float32x4_t v1731 = vld1q_f32((const float32_t *)v1469); + float32x4_t v1733 = vld1q_f32((const float32_t *)v1478); + float32x4_t v1735 = vld1q_f32((const float32_t *)v1487); + float32x4_t v1737 = vld1q_f32((const float32_t *)v1496); + float32x4_t v1739 = vld1q_f32((const float32_t *)v1505); + float32x4_t v1741 = vld1q_f32((const float32_t *)v1514); + float32x4_t v1743 = vld1q_f32((const float32_t *)v1523); float32x4_t v35 = vaddq_f32(v1709, v1711); float32x4_t v36 = vsubq_f32(v1709, v1711); float32x4_t v51 = vaddq_f32(v1713, v1715); @@ -10711,7 +10711,7 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu19(const armral_cmplx_f32_t *restrict x, float32x4_t v542 = vsubq_f32(v520, v521); float32x4_t v544 = vsubq_f32(v472, v488); float32x4_t v545 = vsubq_f32(v480, v488); - *(int16x4_t *)v1543 = v576; + vst1_s16((int16_t *)v1543, v576); float32x4_t v498 = vsubq_f32(v296, v495); float32x4_t v499 = vaddq_f32(v286, v491); float32x4_t v501 = vaddq_f32(v497, v301); @@ -10795,10 +10795,10 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu19(const armral_cmplx_f32_t *restrict x, float32x4_t v699 = vsubq_f32(v556, v568); int16x4_t v711 = vqmovn_s32(vcvtq_n_s32_f32(v708, 15)); int16x4_t v720 = vqmovn_s32(vcvtq_n_s32_f32(v717, 15)); - *(int16x4_t *)v1570 = v603; - *(int16x4_t *)v1579 = v612; - 
*(int16x4_t *)v1588 = v621; - *(int16x4_t *)v1597 = v630; + vst1_s16((int16_t *)v1570, v603); + vst1_s16((int16_t *)v1579, v612); + vst1_s16((int16_t *)v1588, v621); + vst1_s16((int16_t *)v1597, v630); float32x4_t v582 = vaddq_f32(v551, v563); float32x4_t v591 = vsubq_f32(v551, v563); int16x4_t v675 = vqmovn_s32(vcvtq_n_s32_f32(v672, 15)); @@ -10807,24 +10807,24 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu19(const armral_cmplx_f32_t *restrict x, int16x4_t v702 = vqmovn_s32(vcvtq_n_s32_f32(v699, 15)); float32x4_t v726 = vaddq_f32(v553, v565); float32x4_t v735 = vsubq_f32(v553, v565); - *(int16x4_t *)v1606 = v639; - *(int16x4_t *)v1615 = v648; - *(int16x4_t *)v1624 = v657; - *(int16x4_t *)v1633 = v666; - *(int16x4_t *)v1678 = v711; - *(int16x4_t *)v1687 = v720; + vst1_s16((int16_t *)v1606, v639); + vst1_s16((int16_t *)v1615, v648); + vst1_s16((int16_t *)v1624, v657); + vst1_s16((int16_t *)v1633, v666); + vst1_s16((int16_t *)v1678, v711); + vst1_s16((int16_t *)v1687, v720); int16x4_t v585 = vqmovn_s32(vcvtq_n_s32_f32(v582, 15)); int16x4_t v594 = vqmovn_s32(vcvtq_n_s32_f32(v591, 15)); int16x4_t v729 = vqmovn_s32(vcvtq_n_s32_f32(v726, 15)); int16x4_t v738 = vqmovn_s32(vcvtq_n_s32_f32(v735, 15)); - *(int16x4_t *)v1642 = v675; - *(int16x4_t *)v1651 = v684; - *(int16x4_t *)v1660 = v693; - *(int16x4_t *)v1669 = v702; - *(int16x4_t *)v1552 = v585; - *(int16x4_t *)v1561 = v594; - *(int16x4_t *)v1696 = v729; - *(int16x4_t *)v1705 = v738; + vst1_s16((int16_t *)v1642, v675); + vst1_s16((int16_t *)v1651, v684); + vst1_s16((int16_t *)v1660, v693); + vst1_s16((int16_t *)v1669, v702); + vst1_s16((int16_t *)v1552, v585); + vst1_s16((int16_t *)v1561, v594); + vst1_s16((int16_t *)v1696, v729); + vst1_s16((int16_t *)v1705, v738); v5 += 2 * 1; v6 += 2 * 1; } @@ -12033,7 +12033,7 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu20(const armral_cmplx_f32_t *restrict x, float32x2_t v403 = (float32x2_t){v402, v402}; const float32x2_t *v1064 = &v5[0]; int32_t *v1245 = &v6[0]; - float32x4_t v1456 = 
*(const float32x4_t *)v1226; + float32x4_t v1456 = vld1q_f32((const float32_t *)v1226); float32x4_t v319 = vcombine_f32(v318, v318); float32x4_t v324 = vcombine_f32(v323, v323); float32x2_t v330 = vmul_f32(v386, v328); @@ -12081,31 +12081,31 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu20(const armral_cmplx_f32_t *restrict x, int32_t *v1398 = &v6[ostride * 9]; int32_t *v1407 = &v6[ostride * 14]; int32_t *v1416 = &v6[ostride * 19]; - float32x4_t v1420 = *(const float32x4_t *)v1064; + float32x4_t v1420 = vld1q_f32((const float32_t *)v1064); float32x4_t v332 = vcombine_f32(v330, v330); float32x4_t v340 = vcombine_f32(v338, v338); float32x4_t v348 = vcombine_f32(v346, v346); float32x4_t v373 = vcombine_f32(v371, v371); float32x4_t v381 = vcombine_f32(v379, v379); float32x4_t v389 = vcombine_f32(v387, v387); - float32x4_t v1422 = *(const float32x4_t *)v1073; - float32x4_t v1424 = *(const float32x4_t *)v1082; - float32x4_t v1426 = *(const float32x4_t *)v1091; - float32x4_t v1428 = *(const float32x4_t *)v1100; - float32x4_t v1430 = *(const float32x4_t *)v1109; - float32x4_t v1432 = *(const float32x4_t *)v1118; - float32x4_t v1434 = *(const float32x4_t *)v1127; - float32x4_t v1436 = *(const float32x4_t *)v1136; - float32x4_t v1438 = *(const float32x4_t *)v1145; - float32x4_t v1440 = *(const float32x4_t *)v1154; - float32x4_t v1442 = *(const float32x4_t *)v1163; - float32x4_t v1444 = *(const float32x4_t *)v1172; - float32x4_t v1446 = *(const float32x4_t *)v1181; - float32x4_t v1448 = *(const float32x4_t *)v1190; - float32x4_t v1450 = *(const float32x4_t *)v1199; - float32x4_t v1452 = *(const float32x4_t *)v1208; - float32x4_t v1454 = *(const float32x4_t *)v1217; - float32x4_t v1458 = *(const float32x4_t *)v1235; + float32x4_t v1422 = vld1q_f32((const float32_t *)v1073); + float32x4_t v1424 = vld1q_f32((const float32_t *)v1082); + float32x4_t v1426 = vld1q_f32((const float32_t *)v1091); + float32x4_t v1428 = vld1q_f32((const float32_t *)v1100); + float32x4_t v1430 = 
vld1q_f32((const float32_t *)v1109); + float32x4_t v1432 = vld1q_f32((const float32_t *)v1118); + float32x4_t v1434 = vld1q_f32((const float32_t *)v1127); + float32x4_t v1436 = vld1q_f32((const float32_t *)v1136); + float32x4_t v1438 = vld1q_f32((const float32_t *)v1145); + float32x4_t v1440 = vld1q_f32((const float32_t *)v1154); + float32x4_t v1442 = vld1q_f32((const float32_t *)v1163); + float32x4_t v1444 = vld1q_f32((const float32_t *)v1172); + float32x4_t v1446 = vld1q_f32((const float32_t *)v1181); + float32x4_t v1448 = vld1q_f32((const float32_t *)v1190); + float32x4_t v1450 = vld1q_f32((const float32_t *)v1199); + float32x4_t v1452 = vld1q_f32((const float32_t *)v1208); + float32x4_t v1454 = vld1q_f32((const float32_t *)v1217); + float32x4_t v1458 = vld1q_f32((const float32_t *)v1235); float32x4_t v35 = vaddq_f32(v1420, v1422); float32x4_t v36 = vsubq_f32(v1420, v1422); float32x4_t v51 = vaddq_f32(v1424, v1426); @@ -12227,8 +12227,8 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu20(const armral_cmplx_f32_t *restrict x, float32x4_t v406 = vaddq_f32(v374, v382); float32x4_t v415 = vaddq_f32(v310, v374); float32x4_t v416 = vsubq_f32(v310, v374); - *(int16x4_t *)v1245 = v419; - *(int16x4_t *)v1263 = v435; + vst1_s16((int16_t *)v1245, v419); + vst1_s16((int16_t *)v1263, v435); float32x4_t v243 = vaddq_f32(v239, v241); float32x4_t v244 = vsubq_f32(v239, v241); float32x4_t v245 = vaddq_f32(v240, v242); @@ -12253,8 +12253,8 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu20(const armral_cmplx_f32_t *restrict x, int16x4_t v537 = vqmovn_s32(vcvtq_n_s32_f32(v301, 15)); int16x4_t v555 = vqmovn_s32(vcvtq_n_s32_f32(v243, 15)); int16x4_t v571 = vqmovn_s32(vcvtq_n_s32_f32(v299, 15)); - *(int16x4_t *)v1254 = v427; - *(int16x4_t *)v1272 = v443; + vst1_s16((int16_t *)v1254, v427); + vst1_s16((int16_t *)v1272, v443); float32x4_t v449 = vaddq_f32(v356, v412); float32x4_t v450 = vsubq_f32(v356, v412); float32x4_t v483 = vaddq_f32(v358, v414); @@ -12263,14 +12263,14 @@ void 
armral_fft_cf32_cf32_cs16_ac_n_uu20(const armral_cmplx_f32_t *restrict x, float32x4_t v518 = vsubq_f32(v357, v413); float32x4_t v551 = vaddq_f32(v355, v411); float32x4_t v552 = vsubq_f32(v355, v411); - *(int16x4_t *)v1281 = v453; - *(int16x4_t *)v1299 = v469; - *(int16x4_t *)v1317 = v487; - *(int16x4_t *)v1335 = v503; - *(int16x4_t *)v1353 = v521; - *(int16x4_t *)v1371 = v537; - *(int16x4_t *)v1389 = v555; - *(int16x4_t *)v1407 = v571; + vst1_s16((int16_t *)v1281, v453); + vst1_s16((int16_t *)v1299, v469); + vst1_s16((int16_t *)v1317, v487); + vst1_s16((int16_t *)v1335, v503); + vst1_s16((int16_t *)v1353, v521); + vst1_s16((int16_t *)v1371, v537); + vst1_s16((int16_t *)v1389, v555); + vst1_s16((int16_t *)v1407, v571); int16x4_t v461 = vqmovn_s32(vcvtq_n_s32_f32(v450, 15)); int16x4_t v477 = vqmovn_s32(vcvtq_n_s32_f32(v449, 15)); int16x4_t v495 = vqmovn_s32(vcvtq_n_s32_f32(v484, 15)); @@ -12279,14 +12279,14 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu20(const armral_cmplx_f32_t *restrict x, int16x4_t v545 = vqmovn_s32(vcvtq_n_s32_f32(v517, 15)); int16x4_t v563 = vqmovn_s32(vcvtq_n_s32_f32(v552, 15)); int16x4_t v579 = vqmovn_s32(vcvtq_n_s32_f32(v551, 15)); - *(int16x4_t *)v1290 = v461; - *(int16x4_t *)v1308 = v477; - *(int16x4_t *)v1326 = v495; - *(int16x4_t *)v1344 = v511; - *(int16x4_t *)v1362 = v529; - *(int16x4_t *)v1380 = v545; - *(int16x4_t *)v1398 = v563; - *(int16x4_t *)v1416 = v579; + vst1_s16((int16_t *)v1290, v461); + vst1_s16((int16_t *)v1308, v477); + vst1_s16((int16_t *)v1326, v495); + vst1_s16((int16_t *)v1344, v511); + vst1_s16((int16_t *)v1362, v529); + vst1_s16((int16_t *)v1380, v545); + vst1_s16((int16_t *)v1398, v563); + vst1_s16((int16_t *)v1416, v579); v5 += 2 * 1; v6 += 2 * 1; } @@ -13141,7 +13141,7 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu21(const armral_cmplx_f32_t *restrict x, float32x2_t v449 = (float32x2_t){v448, v448}; const float32x2_t *v1225 = &v5[0]; int32_t *v1397 = &v6[0]; - float32x4_t v1611 = *(const float32x4_t *)v1342; + float32x4_t 
v1611 = vld1q_f32((const float32_t *)v1342); float32x4_t v214 = vcombine_f32(v213, v213); float32x4_t v219 = vcombine_f32(v218, v218); float32x4_t v224 = vcombine_f32(v223, v223); @@ -13206,7 +13206,7 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu21(const armral_cmplx_f32_t *restrict x, int32_t *v1559 = &v6[ostride * 6]; int32_t *v1568 = &v6[ostride * 13]; int32_t *v1577 = &v6[ostride * 20]; - float32x4_t v1585 = *(const float32x4_t *)v1225; + float32x4_t v1585 = vld1q_f32((const float32_t *)v1225); float32x4_t v237 = vcombine_f32(v235, v235); float32x4_t v245 = vcombine_f32(v243, v243); float32x4_t v253 = vcombine_f32(v251, v251); @@ -13220,25 +13220,25 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu21(const armral_cmplx_f32_t *restrict x, float32x4_t v414 = vcombine_f32(v412, v412); float32x4_t v422 = vcombine_f32(v420, v420); float32x4_t v430 = vcombine_f32(v428, v428); - float32x4_t v1581 = *(const float32x4_t *)v1206; - float32x4_t v1583 = *(const float32x4_t *)v1215; - float32x4_t v1587 = *(const float32x4_t *)v1234; - float32x4_t v1589 = *(const float32x4_t *)v1243; - float32x4_t v1591 = *(const float32x4_t *)v1252; - float32x4_t v1593 = *(const float32x4_t *)v1261; - float32x4_t v1595 = *(const float32x4_t *)v1270; - float32x4_t v1597 = *(const float32x4_t *)v1279; - float32x4_t v1599 = *(const float32x4_t *)v1288; - float32x4_t v1601 = *(const float32x4_t *)v1297; - float32x4_t v1603 = *(const float32x4_t *)v1306; - float32x4_t v1605 = *(const float32x4_t *)v1315; - float32x4_t v1607 = *(const float32x4_t *)v1324; - float32x4_t v1609 = *(const float32x4_t *)v1333; - float32x4_t v1613 = *(const float32x4_t *)v1351; - float32x4_t v1615 = *(const float32x4_t *)v1360; - float32x4_t v1617 = *(const float32x4_t *)v1369; - float32x4_t v1619 = *(const float32x4_t *)v1378; - float32x4_t v1621 = *(const float32x4_t *)v1387; + float32x4_t v1581 = vld1q_f32((const float32_t *)v1206); + float32x4_t v1583 = vld1q_f32((const float32_t *)v1215); + float32x4_t v1587 = 
vld1q_f32((const float32_t *)v1234); + float32x4_t v1589 = vld1q_f32((const float32_t *)v1243); + float32x4_t v1591 = vld1q_f32((const float32_t *)v1252); + float32x4_t v1593 = vld1q_f32((const float32_t *)v1261); + float32x4_t v1595 = vld1q_f32((const float32_t *)v1270); + float32x4_t v1597 = vld1q_f32((const float32_t *)v1279); + float32x4_t v1599 = vld1q_f32((const float32_t *)v1288); + float32x4_t v1601 = vld1q_f32((const float32_t *)v1297); + float32x4_t v1603 = vld1q_f32((const float32_t *)v1306); + float32x4_t v1605 = vld1q_f32((const float32_t *)v1315); + float32x4_t v1607 = vld1q_f32((const float32_t *)v1324); + float32x4_t v1609 = vld1q_f32((const float32_t *)v1333); + float32x4_t v1613 = vld1q_f32((const float32_t *)v1351); + float32x4_t v1615 = vld1q_f32((const float32_t *)v1360); + float32x4_t v1617 = vld1q_f32((const float32_t *)v1369); + float32x4_t v1619 = vld1q_f32((const float32_t *)v1378); + float32x4_t v1621 = vld1q_f32((const float32_t *)v1387); float32x4_t v35 = vaddq_f32(v1581, v1583); float32x4_t v36 = vsubq_f32(v1581, v1583); float32x4_t v59 = vaddq_f32(v1587, v1589); @@ -13378,7 +13378,7 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu21(const armral_cmplx_f32_t *restrict x, float32x4_t v452 = vaddq_f32(v399, v407); float32x4_t v472 = vaddq_f32(v471, v399); float32x4_t v473 = vsubq_f32(v471, v399); - *(int16x4_t *)v1397 = v476; + vst1_s16((int16_t *)v1397, v476); float32x4_t v265 = vaddq_f32(v264, v225); float32x4_t v267 = vsubq_f32(v266, v230); float32x4_t v269 = vaddq_f32(v268, v230); @@ -13408,8 +13408,8 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu21(const armral_cmplx_f32_t *restrict x, float32x4_t v454 = vaddq_f32(v453, v423); float32x4_t v456 = vsubq_f32(v455, v431); float32x4_t v458 = vaddq_f32(v457, v431); - *(int16x4_t *)v1406 = v484; - *(int16x4_t *)v1415 = v492; + vst1_s16((int16_t *)v1406, v484); + vst1_s16((int16_t *)v1415, v492); float32x4_t v465 = vaddq_f32(v454, v460); float32x4_t v466 = vsubq_f32(v454, v460); float32x4_t v467 = 
vaddq_f32(v456, v462); @@ -13440,12 +13440,12 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu21(const armral_cmplx_f32_t *restrict x, float32x4_t v608 = vsubq_f32(v606, v467); float32x4_t v634 = vaddq_f32(v633, v465); float32x4_t v635 = vsubq_f32(v633, v465); - *(int16x4_t *)v1424 = v503; - *(int16x4_t *)v1451 = v530; - *(int16x4_t *)v1478 = v557; - *(int16x4_t *)v1505 = v584; - *(int16x4_t *)v1532 = v611; - *(int16x4_t *)v1559 = v638; + vst1_s16((int16_t *)v1424, v503); + vst1_s16((int16_t *)v1451, v530); + vst1_s16((int16_t *)v1478, v557); + vst1_s16((int16_t *)v1505, v584); + vst1_s16((int16_t *)v1532, v611); + vst1_s16((int16_t *)v1559, v638); int16x4_t v511 = vqmovn_s32(vcvtq_n_s32_f32(v500, 15)); int16x4_t v519 = vqmovn_s32(vcvtq_n_s32_f32(v499, 15)); int16x4_t v538 = vqmovn_s32(vcvtq_n_s32_f32(v527, 15)); @@ -13458,18 +13458,18 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu21(const armral_cmplx_f32_t *restrict x, int16x4_t v627 = vqmovn_s32(vcvtq_n_s32_f32(v607, 15)); int16x4_t v646 = vqmovn_s32(vcvtq_n_s32_f32(v635, 15)); int16x4_t v654 = vqmovn_s32(vcvtq_n_s32_f32(v634, 15)); - *(int16x4_t *)v1433 = v511; - *(int16x4_t *)v1442 = v519; - *(int16x4_t *)v1460 = v538; - *(int16x4_t *)v1469 = v546; - *(int16x4_t *)v1487 = v565; - *(int16x4_t *)v1496 = v573; - *(int16x4_t *)v1514 = v592; - *(int16x4_t *)v1523 = v600; - *(int16x4_t *)v1541 = v619; - *(int16x4_t *)v1550 = v627; - *(int16x4_t *)v1568 = v646; - *(int16x4_t *)v1577 = v654; + vst1_s16((int16_t *)v1433, v511); + vst1_s16((int16_t *)v1442, v519); + vst1_s16((int16_t *)v1460, v538); + vst1_s16((int16_t *)v1469, v546); + vst1_s16((int16_t *)v1487, v565); + vst1_s16((int16_t *)v1496, v573); + vst1_s16((int16_t *)v1514, v592); + vst1_s16((int16_t *)v1523, v600); + vst1_s16((int16_t *)v1541, v619); + vst1_s16((int16_t *)v1550, v627); + vst1_s16((int16_t *)v1568, v646); + vst1_s16((int16_t *)v1577, v654); v5 += 2 * 1; v6 += 2 * 1; } @@ -14553,7 +14553,7 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu22(const 
armral_cmplx_f32_t *restrict x, float32x2_t v582 = (float32x2_t){v580, v581}; const float32x2_t *v1490 = &v5[0]; int32_t *v1689 = &v6[0]; - float32x4_t v1908 = *(const float32x4_t *)v1607; + float32x4_t v1908 = vld1q_f32((const float32_t *)v1607); float32x4_t v461 = vcombine_f32(v460, v460); float32x2_t v467 = vmul_f32(v583, v465); float32x4_t v474 = vcombine_f32(v473, v473); @@ -14614,7 +14614,7 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu22(const armral_cmplx_f32_t *restrict x, int32_t *v1860 = &v6[ostride * 9]; int32_t *v1869 = &v6[ostride * 10]; int32_t *v1878 = &v6[ostride * 21]; - float32x4_t v1882 = *(const float32x4_t *)v1490; + float32x4_t v1882 = vld1q_f32((const float32_t *)v1490); float32x4_t v469 = vcombine_f32(v467, v467); float32x4_t v522 = vcombine_f32(v520, v520); float32x4_t v530 = vcombine_f32(v528, v528); @@ -14625,26 +14625,26 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu22(const armral_cmplx_f32_t *restrict x, float32x4_t v570 = vcombine_f32(v568, v568); float32x4_t v578 = vcombine_f32(v576, v576); float32x4_t v586 = vcombine_f32(v584, v584); - float32x4_t v1884 = *(const float32x4_t *)v1499; - float32x4_t v1886 = *(const float32x4_t *)v1508; - float32x4_t v1888 = *(const float32x4_t *)v1517; - float32x4_t v1890 = *(const float32x4_t *)v1526; - float32x4_t v1892 = *(const float32x4_t *)v1535; - float32x4_t v1894 = *(const float32x4_t *)v1544; - float32x4_t v1896 = *(const float32x4_t *)v1553; - float32x4_t v1898 = *(const float32x4_t *)v1562; - float32x4_t v1900 = *(const float32x4_t *)v1571; - float32x4_t v1902 = *(const float32x4_t *)v1580; - float32x4_t v1904 = *(const float32x4_t *)v1589; - float32x4_t v1906 = *(const float32x4_t *)v1598; - float32x4_t v1910 = *(const float32x4_t *)v1616; - float32x4_t v1912 = *(const float32x4_t *)v1625; - float32x4_t v1914 = *(const float32x4_t *)v1634; - float32x4_t v1916 = *(const float32x4_t *)v1643; - float32x4_t v1918 = *(const float32x4_t *)v1652; - float32x4_t v1920 = *(const float32x4_t *)v1661; - 
float32x4_t v1922 = *(const float32x4_t *)v1670; - float32x4_t v1924 = *(const float32x4_t *)v1679; + float32x4_t v1884 = vld1q_f32((const float32_t *)v1499); + float32x4_t v1886 = vld1q_f32((const float32_t *)v1508); + float32x4_t v1888 = vld1q_f32((const float32_t *)v1517); + float32x4_t v1890 = vld1q_f32((const float32_t *)v1526); + float32x4_t v1892 = vld1q_f32((const float32_t *)v1535); + float32x4_t v1894 = vld1q_f32((const float32_t *)v1544); + float32x4_t v1896 = vld1q_f32((const float32_t *)v1553); + float32x4_t v1898 = vld1q_f32((const float32_t *)v1562); + float32x4_t v1900 = vld1q_f32((const float32_t *)v1571); + float32x4_t v1902 = vld1q_f32((const float32_t *)v1580); + float32x4_t v1904 = vld1q_f32((const float32_t *)v1589); + float32x4_t v1906 = vld1q_f32((const float32_t *)v1598); + float32x4_t v1910 = vld1q_f32((const float32_t *)v1616); + float32x4_t v1912 = vld1q_f32((const float32_t *)v1625); + float32x4_t v1914 = vld1q_f32((const float32_t *)v1634); + float32x4_t v1916 = vld1q_f32((const float32_t *)v1643); + float32x4_t v1918 = vld1q_f32((const float32_t *)v1652); + float32x4_t v1920 = vld1q_f32((const float32_t *)v1661); + float32x4_t v1922 = vld1q_f32((const float32_t *)v1670); + float32x4_t v1924 = vld1q_f32((const float32_t *)v1679); float32x4_t v35 = vaddq_f32(v1882, v1884); float32x4_t v36 = vsubq_f32(v1882, v1884); float32x4_t v51 = vaddq_f32(v1886, v1888); @@ -14857,8 +14857,8 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu22(const armral_cmplx_f32_t *restrict x, float32x4_t v617 = vaddq_f32(v470, v602); float32x4_t v619 = vsubq_f32(v602, v598); float32x4_t v622 = vaddq_f32(v621, v599); - *(int16x4_t *)v1689 = v637; - *(int16x4_t *)v1698 = v645; + vst1_s16((int16_t *)v1689, v637); + vst1_s16((int16_t *)v1698, v645); float32x4_t v387 = vsubq_f32(v386, v376); float32x4_t v389 = vaddq_f32(v388, v377); float32x4_t v391 = vsubq_f32(v390, v377); @@ -14919,26 +14919,26 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu22(const armral_cmplx_f32_t *restrict 
x, int16x4_t v661 = vqmovn_s32(vcvtq_n_s32_f32(v634, 15)); int16x4_t v797 = vqmovn_s32(vcvtq_n_s32_f32(v406, 15)); int16x4_t v805 = vqmovn_s32(vcvtq_n_s32_f32(v625, 15)); - *(int16x4_t *)v1725 = v669; - *(int16x4_t *)v1734 = v677; - *(int16x4_t *)v1743 = v685; - *(int16x4_t *)v1752 = v693; - *(int16x4_t *)v1761 = v701; - *(int16x4_t *)v1770 = v709; - *(int16x4_t *)v1779 = v717; - *(int16x4_t *)v1788 = v725; - *(int16x4_t *)v1797 = v733; - *(int16x4_t *)v1806 = v741; - *(int16x4_t *)v1815 = v749; - *(int16x4_t *)v1824 = v757; - *(int16x4_t *)v1833 = v765; - *(int16x4_t *)v1842 = v773; - *(int16x4_t *)v1851 = v781; - *(int16x4_t *)v1860 = v789; - *(int16x4_t *)v1707 = v653; - *(int16x4_t *)v1716 = v661; - *(int16x4_t *)v1869 = v797; - *(int16x4_t *)v1878 = v805; + vst1_s16((int16_t *)v1725, v669); + vst1_s16((int16_t *)v1734, v677); + vst1_s16((int16_t *)v1743, v685); + vst1_s16((int16_t *)v1752, v693); + vst1_s16((int16_t *)v1761, v701); + vst1_s16((int16_t *)v1770, v709); + vst1_s16((int16_t *)v1779, v717); + vst1_s16((int16_t *)v1788, v725); + vst1_s16((int16_t *)v1797, v733); + vst1_s16((int16_t *)v1806, v741); + vst1_s16((int16_t *)v1815, v749); + vst1_s16((int16_t *)v1824, v757); + vst1_s16((int16_t *)v1833, v765); + vst1_s16((int16_t *)v1842, v773); + vst1_s16((int16_t *)v1851, v781); + vst1_s16((int16_t *)v1860, v789); + vst1_s16((int16_t *)v1707, v653); + vst1_s16((int16_t *)v1716, v661); + vst1_s16((int16_t *)v1869, v797); + vst1_s16((int16_t *)v1878, v805); v5 += 2 * 1; v6 += 2 * 1; } @@ -16137,7 +16137,7 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu24(const armral_cmplx_f32_t *restrict x, float32x2_t v428 = (float32x2_t){v426, v427}; const float32x2_t *v1216 = &v5[0]; int32_t *v1415 = &v6[0]; - float32x4_t v1646 = *(const float32x4_t *)v1288; + float32x4_t v1646 = vld1q_f32((const float32_t *)v1288); float32x2_t v261 = vmul_f32(v429, v259); float32x2_t v269 = vmul_f32(v429, v267); float32x4_t v276 = vcombine_f32(v275, v275); @@ -16193,35 +16193,35 @@ void 
armral_fft_cf32_cf32_cs16_ac_n_uu24(const armral_cmplx_f32_t *restrict x, int32_t *v1604 = &v6[ostride * 15]; int32_t *v1613 = &v6[ostride * 7]; int32_t *v1622 = &v6[ostride * 23]; - float32x4_t v1630 = *(const float32x4_t *)v1216; + float32x4_t v1630 = vld1q_f32((const float32_t *)v1216); float32x4_t v263 = vcombine_f32(v261, v261); float32x4_t v271 = vcombine_f32(v269, v269); float32x4_t v338 = vcombine_f32(v336, v336); float32x4_t v346 = vcombine_f32(v344, v344); float32x4_t v414 = vcombine_f32(v412, v412); float32x4_t v432 = vcombine_f32(v430, v430); - float32x4_t v1626 = *(const float32x4_t *)v1197; - float32x4_t v1628 = *(const float32x4_t *)v1206; - float32x4_t v1632 = *(const float32x4_t *)v1225; - float32x4_t v1634 = *(const float32x4_t *)v1234; - float32x4_t v1636 = *(const float32x4_t *)v1243; - float32x4_t v1638 = *(const float32x4_t *)v1252; - float32x4_t v1640 = *(const float32x4_t *)v1261; - float32x4_t v1642 = *(const float32x4_t *)v1270; - float32x4_t v1644 = *(const float32x4_t *)v1279; - float32x4_t v1648 = *(const float32x4_t *)v1297; - float32x4_t v1650 = *(const float32x4_t *)v1306; - float32x4_t v1652 = *(const float32x4_t *)v1315; - float32x4_t v1654 = *(const float32x4_t *)v1324; - float32x4_t v1656 = *(const float32x4_t *)v1333; - float32x4_t v1658 = *(const float32x4_t *)v1342; - float32x4_t v1660 = *(const float32x4_t *)v1351; - float32x4_t v1662 = *(const float32x4_t *)v1360; - float32x4_t v1664 = *(const float32x4_t *)v1369; - float32x4_t v1666 = *(const float32x4_t *)v1378; - float32x4_t v1668 = *(const float32x4_t *)v1387; - float32x4_t v1670 = *(const float32x4_t *)v1396; - float32x4_t v1672 = *(const float32x4_t *)v1405; + float32x4_t v1626 = vld1q_f32((const float32_t *)v1197); + float32x4_t v1628 = vld1q_f32((const float32_t *)v1206); + float32x4_t v1632 = vld1q_f32((const float32_t *)v1225); + float32x4_t v1634 = vld1q_f32((const float32_t *)v1234); + float32x4_t v1636 = vld1q_f32((const float32_t *)v1243); + float32x4_t v1638 = 
vld1q_f32((const float32_t *)v1252); + float32x4_t v1640 = vld1q_f32((const float32_t *)v1261); + float32x4_t v1642 = vld1q_f32((const float32_t *)v1270); + float32x4_t v1644 = vld1q_f32((const float32_t *)v1279); + float32x4_t v1648 = vld1q_f32((const float32_t *)v1297); + float32x4_t v1650 = vld1q_f32((const float32_t *)v1306); + float32x4_t v1652 = vld1q_f32((const float32_t *)v1315); + float32x4_t v1654 = vld1q_f32((const float32_t *)v1324); + float32x4_t v1656 = vld1q_f32((const float32_t *)v1333); + float32x4_t v1658 = vld1q_f32((const float32_t *)v1342); + float32x4_t v1660 = vld1q_f32((const float32_t *)v1351); + float32x4_t v1662 = vld1q_f32((const float32_t *)v1360); + float32x4_t v1664 = vld1q_f32((const float32_t *)v1369); + float32x4_t v1666 = vld1q_f32((const float32_t *)v1378); + float32x4_t v1668 = vld1q_f32((const float32_t *)v1387); + float32x4_t v1670 = vld1q_f32((const float32_t *)v1396); + float32x4_t v1672 = vld1q_f32((const float32_t *)v1405); float32x4_t v35 = vaddq_f32(v1626, v1628); float32x4_t v36 = vsubq_f32(v1626, v1628); float32x4_t v59 = vaddq_f32(v1632, v1634); @@ -16359,8 +16359,8 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu24(const armral_cmplx_f32_t *restrict x, float32x4_t v446 = vsubq_f32(v444, v386); float32x4_t v553 = vaddq_f32(v552, v394); float32x4_t v554 = vsubq_f32(v552, v394); - *(int16x4_t *)v1415 = v449; - *(int16x4_t *)v1523 = v557; + vst1_s16((int16_t *)v1415, v449); + vst1_s16((int16_t *)v1523, v557); float32x4_t v284 = vaddq_f32(v280, v282); float32x4_t v285 = vsubq_f32(v280, v282); float32x4_t v286 = vaddq_f32(v281, v283); @@ -16385,12 +16385,12 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu24(const armral_cmplx_f32_t *restrict x, float32x4_t v608 = vsubq_f32(v606, v434); float32x4_t v633 = vaddq_f32(v284, v359); int16x4_t v638 = vqmovn_s32(vcvtq_n_s32_f32(v284, 15)); - *(int16x4_t *)v1424 = v457; - *(int16x4_t *)v1433 = v465; - *(int16x4_t *)v1469 = v503; - *(int16x4_t *)v1532 = v565; - *(int16x4_t *)v1541 = v573; - 
*(int16x4_t *)v1577 = v611; + vst1_s16((int16_t *)v1424, v457); + vst1_s16((int16_t *)v1433, v465); + vst1_s16((int16_t *)v1469, v503); + vst1_s16((int16_t *)v1532, v565); + vst1_s16((int16_t *)v1541, v573); + vst1_s16((int16_t *)v1577, v611); float32x4_t v472 = vaddq_f32(v471, v441); float32x4_t v473 = vsubq_f32(v471, v441); int16x4_t v511 = vqmovn_s32(vcvtq_n_s32_f32(v500, 15)); @@ -16403,10 +16403,10 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu24(const armral_cmplx_f32_t *restrict x, int16x4_t v627 = vqmovn_s32(vcvtq_n_s32_f32(v607, 15)); float32x4_t v634 = vaddq_f32(v633, v440); float32x4_t v635 = vsubq_f32(v633, v440); - *(int16x4_t *)v1442 = v476; - *(int16x4_t *)v1496 = v530; - *(int16x4_t *)v1550 = v584; - *(int16x4_t *)v1604 = v638; + vst1_s16((int16_t *)v1442, v476); + vst1_s16((int16_t *)v1496, v530); + vst1_s16((int16_t *)v1550, v584); + vst1_s16((int16_t *)v1604, v638); int16x4_t v484 = vqmovn_s32(vcvtq_n_s32_f32(v473, 15)); int16x4_t v492 = vqmovn_s32(vcvtq_n_s32_f32(v472, 15)); int16x4_t v538 = vqmovn_s32(vcvtq_n_s32_f32(v527, 15)); @@ -16415,18 +16415,18 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu24(const armral_cmplx_f32_t *restrict x, int16x4_t v600 = vqmovn_s32(vcvtq_n_s32_f32(v580, 15)); int16x4_t v646 = vqmovn_s32(vcvtq_n_s32_f32(v635, 15)); int16x4_t v654 = vqmovn_s32(vcvtq_n_s32_f32(v634, 15)); - *(int16x4_t *)v1478 = v511; - *(int16x4_t *)v1487 = v519; - *(int16x4_t *)v1586 = v619; - *(int16x4_t *)v1595 = v627; - *(int16x4_t *)v1451 = v484; - *(int16x4_t *)v1460 = v492; - *(int16x4_t *)v1505 = v538; - *(int16x4_t *)v1514 = v546; - *(int16x4_t *)v1559 = v592; - *(int16x4_t *)v1568 = v600; - *(int16x4_t *)v1613 = v646; - *(int16x4_t *)v1622 = v654; + vst1_s16((int16_t *)v1478, v511); + vst1_s16((int16_t *)v1487, v519); + vst1_s16((int16_t *)v1586, v619); + vst1_s16((int16_t *)v1595, v627); + vst1_s16((int16_t *)v1451, v484); + vst1_s16((int16_t *)v1460, v492); + vst1_s16((int16_t *)v1505, v538); + vst1_s16((int16_t *)v1514, v546); + 
vst1_s16((int16_t *)v1559, v592); + vst1_s16((int16_t *)v1568, v600); + vst1_s16((int16_t *)v1613, v646); + vst1_s16((int16_t *)v1622, v654); v5 += 2 * 1; v6 += 2 * 1; } @@ -17399,7 +17399,7 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu25(const armral_cmplx_f32_t *restrict x, float32x2_t v1714 = (float32x2_t){v1713, v1713}; const float32x2_t *v3160 = &v5[0]; int32_t *v3386 = &v6[0]; - float32x4_t v3616 = *(const float32x4_t *)v3205; + float32x4_t v3616 = vld1q_f32((const float32_t *)v3205); float32x2_t v917 = (float32x2_t){v916, v919}; float32x4_t v1037 = vcombine_f32(v1036, v1036); float32x2_t v1043 = vmul_f32(v1684, v1041); @@ -17469,7 +17469,7 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu25(const armral_cmplx_f32_t *restrict x, int32_t *v3584 = &v6[ostride * 14]; int32_t *v3593 = &v6[ostride * 19]; int32_t *v3602 = &v6[ostride * 24]; - float32x4_t v3606 = *(const float32x4_t *)v3160; + float32x4_t v3606 = vld1q_f32((const float32_t *)v3160); float32x4_t v921 = vcombine_f32(v917, v917); float32x4_t v1045 = vcombine_f32(v1043, v1043); float32x4_t v1218 = vcombine_f32(v1216, v1216); @@ -17481,29 +17481,29 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu25(const armral_cmplx_f32_t *restrict x, float32x4_t v1592 = vcombine_f32(v1590, v1590); float32x4_t v1613 = vcombine_f32(v1611, v1611); float32x4_t v1687 = vcombine_f32(v1685, v1685); - float32x4_t v3608 = *(const float32x4_t *)v3169; - float32x4_t v3610 = *(const float32x4_t *)v3178; - float32x4_t v3612 = *(const float32x4_t *)v3187; - float32x4_t v3614 = *(const float32x4_t *)v3196; - float32x4_t v3618 = *(const float32x4_t *)v3214; - float32x4_t v3620 = *(const float32x4_t *)v3223; - float32x4_t v3622 = *(const float32x4_t *)v3232; - float32x4_t v3624 = *(const float32x4_t *)v3241; - float32x4_t v3626 = *(const float32x4_t *)v3250; - float32x4_t v3628 = *(const float32x4_t *)v3259; - float32x4_t v3630 = *(const float32x4_t *)v3268; - float32x4_t v3632 = *(const float32x4_t *)v3277; - float32x4_t v3634 = *(const float32x4_t 
*)v3286; - float32x4_t v3636 = *(const float32x4_t *)v3295; - float32x4_t v3638 = *(const float32x4_t *)v3304; - float32x4_t v3640 = *(const float32x4_t *)v3313; - float32x4_t v3642 = *(const float32x4_t *)v3322; - float32x4_t v3644 = *(const float32x4_t *)v3331; - float32x4_t v3646 = *(const float32x4_t *)v3340; - float32x4_t v3648 = *(const float32x4_t *)v3349; - float32x4_t v3650 = *(const float32x4_t *)v3358; - float32x4_t v3652 = *(const float32x4_t *)v3367; - float32x4_t v3654 = *(const float32x4_t *)v3376; + float32x4_t v3608 = vld1q_f32((const float32_t *)v3169); + float32x4_t v3610 = vld1q_f32((const float32_t *)v3178); + float32x4_t v3612 = vld1q_f32((const float32_t *)v3187); + float32x4_t v3614 = vld1q_f32((const float32_t *)v3196); + float32x4_t v3618 = vld1q_f32((const float32_t *)v3214); + float32x4_t v3620 = vld1q_f32((const float32_t *)v3223); + float32x4_t v3622 = vld1q_f32((const float32_t *)v3232); + float32x4_t v3624 = vld1q_f32((const float32_t *)v3241); + float32x4_t v3626 = vld1q_f32((const float32_t *)v3250); + float32x4_t v3628 = vld1q_f32((const float32_t *)v3259); + float32x4_t v3630 = vld1q_f32((const float32_t *)v3268); + float32x4_t v3632 = vld1q_f32((const float32_t *)v3277); + float32x4_t v3634 = vld1q_f32((const float32_t *)v3286); + float32x4_t v3636 = vld1q_f32((const float32_t *)v3295); + float32x4_t v3638 = vld1q_f32((const float32_t *)v3304); + float32x4_t v3640 = vld1q_f32((const float32_t *)v3313); + float32x4_t v3642 = vld1q_f32((const float32_t *)v3322); + float32x4_t v3644 = vld1q_f32((const float32_t *)v3331); + float32x4_t v3646 = vld1q_f32((const float32_t *)v3340); + float32x4_t v3648 = vld1q_f32((const float32_t *)v3349); + float32x4_t v3650 = vld1q_f32((const float32_t *)v3358); + float32x4_t v3652 = vld1q_f32((const float32_t *)v3367); + float32x4_t v3654 = vld1q_f32((const float32_t *)v3376); float32x4_t v66 = vrev64q_f32(v3608); float32x4_t v80 = vrev64q_f32(v3610); float32x4_t v94 = vrev64q_f32(v3614); @@ 
-17824,7 +17824,7 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu25(const armral_cmplx_f32_t *restrict x, float32x4_t v1600 = vmulq_f32(v1566, v1715); float32x4_t v1616 = vsubq_f32(v1580, v1615); float32x4_t v1621 = vmulq_f32(v1580, v1715); - *(int16x4_t *)v3386 = v966; + vst1_s16((int16_t *)v3386, v966); float32x4_t v956 = vsubq_f32(v955, v950); float32x4_t v997 = vsubq_f32(v950, v996); float32x4_t v1010 = vmulq_f32(v950, v1715); @@ -17885,8 +17885,8 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu25(const armral_cmplx_f32_t *restrict x, float32x4_t v1655 = vaddq_f32(v188, v1623); float32x4_t v1669 = vrev64q_f32(v1636); float32x4_t v1686 = vrev64q_f32(v1654); - *(int16x4_t *)v3404 = v1000; - *(int16x4_t *)v3476 = v1312; + vst1_s16((int16_t *)v3404, v1000); + vst1_s16((int16_t *)v3476, v1312); int16x4_t v1028 = vqmovn_s32(vcvtq_n_s32_f32(v1025, 15)); float32x4_t v1123 = vsubq_f32(v1111, v1122); float32x4_t v1128 = vmulq_f32(v1111, v1715); @@ -17899,10 +17899,10 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu25(const armral_cmplx_f32_t *restrict x, int16x4_t v1658 = vqmovn_s32(vcvtq_n_s32_f32(v1655, 15)); float32x4_t v1671 = vmulq_f32(v1669, v1687); float32x4_t v1688 = vmulq_f32(v1686, v1687); - *(int16x4_t *)v3395 = v983; - *(int16x4_t *)v3413 = v1014; - *(int16x4_t *)v3431 = v1139; - *(int16x4_t *)v3521 = v1485; + vst1_s16((int16_t *)v3395, v983); + vst1_s16((int16_t *)v3413, v1014); + vst1_s16((int16_t *)v3431, v1139); + vst1_s16((int16_t *)v3521, v1485); float32x4_t v1129 = vsubq_f32(v1128, v1123); float32x4_t v1170 = vsubq_f32(v1123, v1169); float32x4_t v1183 = vmulq_f32(v1123, v1715); @@ -17915,8 +17915,8 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu25(const armral_cmplx_f32_t *restrict x, float32x4_t v1529 = vmulq_f32(v1469, v1715); float32x4_t v1642 = vsubq_f32(v1630, v1641); float32x4_t v1647 = vmulq_f32(v1630, v1715); - *(int16x4_t *)v3422 = v1028; - *(int16x4_t *)v3566 = v1658; + vst1_s16((int16_t *)v3422, v1028); + vst1_s16((int16_t *)v3566, v1658); float32x4_t v1153 = 
vsubq_f32(v1129, v1152); int16x4_t v1173 = vqmovn_s32(vcvtq_n_s32_f32(v1170, 15)); float32x4_t v1184 = vsubq_f32(v1183, v1170); @@ -17931,7 +17931,7 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu25(const armral_cmplx_f32_t *restrict x, float32x4_t v1648 = vsubq_f32(v1647, v1642); float32x4_t v1689 = vsubq_f32(v1642, v1688); float32x4_t v1702 = vmulq_f32(v1642, v1715); - *(int16x4_t *)v3494 = v1346; + vst1_s16((int16_t *)v3494, v1346); int16x4_t v1156 = vqmovn_s32(vcvtq_n_s32_f32(v1153, 15)); int16x4_t v1187 = vqmovn_s32(vcvtq_n_s32_f32(v1184, 15)); float32x4_t v1198 = vsubq_f32(v1197, v1153); @@ -17943,27 +17943,27 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu25(const armral_cmplx_f32_t *restrict x, int16x4_t v1692 = vqmovn_s32(vcvtq_n_s32_f32(v1689, 15)); float32x4_t v1703 = vsubq_f32(v1702, v1689); float32x4_t v1716 = vmulq_f32(v1648, v1715); - *(int16x4_t *)v3449 = v1173; - *(int16x4_t *)v3485 = v1329; - *(int16x4_t *)v3503 = v1360; - *(int16x4_t *)v3539 = v1519; + vst1_s16((int16_t *)v3449, v1173); + vst1_s16((int16_t *)v3485, v1329); + vst1_s16((int16_t *)v3503, v1360); + vst1_s16((int16_t *)v3539, v1519); int16x4_t v1201 = vqmovn_s32(vcvtq_n_s32_f32(v1198, 15)); int16x4_t v1547 = vqmovn_s32(vcvtq_n_s32_f32(v1544, 15)); int16x4_t v1675 = vqmovn_s32(vcvtq_n_s32_f32(v1672, 15)); int16x4_t v1706 = vqmovn_s32(vcvtq_n_s32_f32(v1703, 15)); float32x4_t v1717 = vsubq_f32(v1716, v1672); - *(int16x4_t *)v3440 = v1156; - *(int16x4_t *)v3458 = v1187; - *(int16x4_t *)v3512 = v1374; - *(int16x4_t *)v3530 = v1502; - *(int16x4_t *)v3548 = v1533; - *(int16x4_t *)v3584 = v1692; + vst1_s16((int16_t *)v3440, v1156); + vst1_s16((int16_t *)v3458, v1187); + vst1_s16((int16_t *)v3512, v1374); + vst1_s16((int16_t *)v3530, v1502); + vst1_s16((int16_t *)v3548, v1533); + vst1_s16((int16_t *)v3584, v1692); int16x4_t v1720 = vqmovn_s32(vcvtq_n_s32_f32(v1717, 15)); - *(int16x4_t *)v3467 = v1201; - *(int16x4_t *)v3557 = v1547; - *(int16x4_t *)v3575 = v1675; - *(int16x4_t *)v3593 = v1706; - 
*(int16x4_t *)v3602 = v1720; + vst1_s16((int16_t *)v3467, v1201); + vst1_s16((int16_t *)v3557, v1547); + vst1_s16((int16_t *)v3575, v1675); + vst1_s16((int16_t *)v3593, v1706); + vst1_s16((int16_t *)v3602, v1720); v5 += 2 * 1; v6 += 2 * 1; } @@ -19361,7 +19361,7 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu32(const armral_cmplx_f32_t *restrict x, float32x2_t v1167 = (float32x2_t){v1165, v1166}; const float32x2_t *v2213 = &v5[0]; int32_t *v2502 = &v6[0]; - float32x4_t v2817 = *(const float32x4_t *)v2357; + float32x4_t v2817 = vld1q_f32((const float32_t *)v2357); float32x4_t v694 = vcombine_f32(v693, v693); float32x4_t v768 = vcombine_f32(v767, v767); float32x2_t v774 = vmul_f32(v1168, v772); @@ -19444,7 +19444,7 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu32(const armral_cmplx_f32_t *restrict x, int32_t *v2763 = &v6[ostride * 15]; int32_t *v2772 = &v6[ostride * 23]; int32_t *v2781 = &v6[ostride * 31]; - float32x4_t v2785 = *(const float32x4_t *)v2213; + float32x4_t v2785 = vld1q_f32((const float32_t *)v2213); float32x4_t v776 = vcombine_f32(v774, v774); float32x4_t v850 = vcombine_f32(v848, v848); float32x4_t v937 = vcombine_f32(v935, v935); @@ -19455,36 +19455,36 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu32(const armral_cmplx_f32_t *restrict x, float32x4_t v1146 = vcombine_f32(v1144, v1144); float32x4_t v1159 = vcombine_f32(v1157, v1157); float32x4_t v1171 = vcombine_f32(v1169, v1169); - float32x4_t v2787 = *(const float32x4_t *)v2222; - float32x4_t v2789 = *(const float32x4_t *)v2231; - float32x4_t v2791 = *(const float32x4_t *)v2240; - float32x4_t v2793 = *(const float32x4_t *)v2249; - float32x4_t v2795 = *(const float32x4_t *)v2258; - float32x4_t v2797 = *(const float32x4_t *)v2267; - float32x4_t v2799 = *(const float32x4_t *)v2276; - float32x4_t v2801 = *(const float32x4_t *)v2285; - float32x4_t v2803 = *(const float32x4_t *)v2294; - float32x4_t v2805 = *(const float32x4_t *)v2303; - float32x4_t v2807 = *(const float32x4_t *)v2312; - float32x4_t v2809 = *(const 
float32x4_t *)v2321; - float32x4_t v2811 = *(const float32x4_t *)v2330; - float32x4_t v2813 = *(const float32x4_t *)v2339; - float32x4_t v2815 = *(const float32x4_t *)v2348; - float32x4_t v2819 = *(const float32x4_t *)v2366; - float32x4_t v2821 = *(const float32x4_t *)v2375; - float32x4_t v2823 = *(const float32x4_t *)v2384; - float32x4_t v2825 = *(const float32x4_t *)v2393; - float32x4_t v2827 = *(const float32x4_t *)v2402; - float32x4_t v2829 = *(const float32x4_t *)v2411; - float32x4_t v2831 = *(const float32x4_t *)v2420; - float32x4_t v2833 = *(const float32x4_t *)v2429; - float32x4_t v2835 = *(const float32x4_t *)v2438; - float32x4_t v2837 = *(const float32x4_t *)v2447; - float32x4_t v2839 = *(const float32x4_t *)v2456; - float32x4_t v2841 = *(const float32x4_t *)v2465; - float32x4_t v2843 = *(const float32x4_t *)v2474; - float32x4_t v2845 = *(const float32x4_t *)v2483; - float32x4_t v2847 = *(const float32x4_t *)v2492; + float32x4_t v2787 = vld1q_f32((const float32_t *)v2222); + float32x4_t v2789 = vld1q_f32((const float32_t *)v2231); + float32x4_t v2791 = vld1q_f32((const float32_t *)v2240); + float32x4_t v2793 = vld1q_f32((const float32_t *)v2249); + float32x4_t v2795 = vld1q_f32((const float32_t *)v2258); + float32x4_t v2797 = vld1q_f32((const float32_t *)v2267); + float32x4_t v2799 = vld1q_f32((const float32_t *)v2276); + float32x4_t v2801 = vld1q_f32((const float32_t *)v2285); + float32x4_t v2803 = vld1q_f32((const float32_t *)v2294); + float32x4_t v2805 = vld1q_f32((const float32_t *)v2303); + float32x4_t v2807 = vld1q_f32((const float32_t *)v2312); + float32x4_t v2809 = vld1q_f32((const float32_t *)v2321); + float32x4_t v2811 = vld1q_f32((const float32_t *)v2330); + float32x4_t v2813 = vld1q_f32((const float32_t *)v2339); + float32x4_t v2815 = vld1q_f32((const float32_t *)v2348); + float32x4_t v2819 = vld1q_f32((const float32_t *)v2366); + float32x4_t v2821 = vld1q_f32((const float32_t *)v2375); + float32x4_t v2823 = vld1q_f32((const float32_t 
*)v2384); + float32x4_t v2825 = vld1q_f32((const float32_t *)v2393); + float32x4_t v2827 = vld1q_f32((const float32_t *)v2402); + float32x4_t v2829 = vld1q_f32((const float32_t *)v2411); + float32x4_t v2831 = vld1q_f32((const float32_t *)v2420); + float32x4_t v2833 = vld1q_f32((const float32_t *)v2429); + float32x4_t v2835 = vld1q_f32((const float32_t *)v2438); + float32x4_t v2837 = vld1q_f32((const float32_t *)v2447); + float32x4_t v2839 = vld1q_f32((const float32_t *)v2456); + float32x4_t v2841 = vld1q_f32((const float32_t *)v2465); + float32x4_t v2843 = vld1q_f32((const float32_t *)v2474); + float32x4_t v2845 = vld1q_f32((const float32_t *)v2483); + float32x4_t v2847 = vld1q_f32((const float32_t *)v2492); float32x4_t v35 = vaddq_f32(v2785, v2787); float32x4_t v36 = vsubq_f32(v2785, v2787); float32x4_t v51 = vaddq_f32(v2789, v2791); @@ -19666,8 +19666,8 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu32(const armral_cmplx_f32_t *restrict x, float32x4_t v658 = vaddq_f32(v252, v654); float32x4_t v939 = vaddq_f32(v917, v925); float32x4_t v940 = vaddq_f32(v930, v938); - *(int16x4_t *)v2502 = v661; - *(int16x4_t *)v2520 = v677; + vst1_s16((int16_t *)v2502, v661); + vst1_s16((int16_t *)v2520, v677); float32x4_t v148 = vmulq_f32(v146, v1171); float32x4_t v283 = vaddq_f32(v281, v282); float32x4_t v284 = vsubq_f32(v282, v281); @@ -19712,8 +19712,8 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu32(const armral_cmplx_f32_t *restrict x, float32x4_t v952 = vsubq_f32(v253, v941); float32x4_t v1089 = vaddq_f32(v1087, v1088); float32x4_t v1090 = vsubq_f32(v1088, v1087); - *(int16x4_t *)v2511 = v669; - *(int16x4_t *)v2529 = v685; + vst1_s16((int16_t *)v2511, v669); + vst1_s16((int16_t *)v2529, v685); float32x4_t v292 = vmulq_f32(v290, v1171); float32x4_t v337 = vsubq_f32(v110, v334); float32x4_t v338 = vaddq_f32(v110, v334); @@ -19755,8 +19755,8 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu32(const armral_cmplx_f32_t *restrict x, float32x4_t v1098 = vmulq_f32(v1096, v1171); float32x4_t v1099 = 
vaddq_f32(v337, v1089); float32x4_t v1100 = vsubq_f32(v337, v1089); - *(int16x4_t *)v2646 = v957; - *(int16x4_t *)v2664 = v973; + vst1_s16((int16_t *)v2646, v957); + vst1_s16((int16_t *)v2664, v973); float32x4_t v726 = vrev64q_f32(v720); float32x4_t v729 = vaddq_f32(v293, v719); float32x4_t v730 = vsubq_f32(v293, v719); @@ -19775,8 +19775,8 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu32(const armral_cmplx_f32_t *restrict x, int16x4_t v1121 = vqmovn_s32(vcvtq_n_s32_f32(v1100, 15)); float32x4_t v1161 = vfmaq_f32(v1139, v1145, v1146); float32x4_t v1162 = vfmaq_f32(v1152, v1158, v1159); - *(int16x4_t *)v2574 = v809; - *(int16x4_t *)v2592 = v825; + vst1_s16((int16_t *)v2574, v809); + vst1_s16((int16_t *)v2592, v825); float32x4_t v728 = vmulq_f32(v726, v1171); int16x4_t v735 = vqmovn_s32(vcvtq_n_s32_f32(v729, 15)); int16x4_t v751 = vqmovn_s32(vcvtq_n_s32_f32(v730, 15)); @@ -19791,10 +19791,10 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu32(const armral_cmplx_f32_t *restrict x, int16x4_t v1129 = vqmovn_s32(vcvtq_n_s32_f32(v1102, 15)); float32x4_t v1163 = vaddq_f32(v1161, v1162); float32x4_t v1164 = vsubq_f32(v1162, v1161); - *(int16x4_t *)v2655 = v965; - *(int16x4_t *)v2673 = v981; - *(int16x4_t *)v2718 = v1105; - *(int16x4_t *)v2736 = v1121; + vst1_s16((int16_t *)v2655, v965); + vst1_s16((int16_t *)v2673, v981); + vst1_s16((int16_t *)v2718, v1105); + vst1_s16((int16_t *)v2736, v1121); float32x4_t v731 = vsubq_f32(v294, v728); float32x4_t v732 = vaddq_f32(v294, v728); float32x4_t v874 = vrev64q_f32(v868); @@ -19805,14 +19805,14 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu32(const armral_cmplx_f32_t *restrict x, float32x4_t v1170 = vrev64q_f32(v1164); float32x4_t v1173 = vaddq_f32(v379, v1163); float32x4_t v1174 = vsubq_f32(v379, v1163); - *(int16x4_t *)v2538 = v735; - *(int16x4_t *)v2556 = v751; - *(int16x4_t *)v2583 = v817; - *(int16x4_t *)v2601 = v833; - *(int16x4_t *)v2682 = v1031; - *(int16x4_t *)v2700 = v1047; - *(int16x4_t *)v2727 = v1113; - *(int16x4_t *)v2745 = v1129; + 
vst1_s16((int16_t *)v2538, v735); + vst1_s16((int16_t *)v2556, v751); + vst1_s16((int16_t *)v2583, v817); + vst1_s16((int16_t *)v2601, v833); + vst1_s16((int16_t *)v2682, v1031); + vst1_s16((int16_t *)v2700, v1047); + vst1_s16((int16_t *)v2727, v1113); + vst1_s16((int16_t *)v2745, v1129); int16x4_t v743 = vqmovn_s32(vcvtq_n_s32_f32(v731, 15)); int16x4_t v759 = vqmovn_s32(vcvtq_n_s32_f32(v732, 15)); float32x4_t v876 = vmulq_f32(v874, v1171); @@ -19827,22 +19827,22 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu32(const armral_cmplx_f32_t *restrict x, float32x4_t v880 = vaddq_f32(v378, v876); float32x4_t v1175 = vsubq_f32(v380, v1172); float32x4_t v1176 = vaddq_f32(v380, v1172); - *(int16x4_t *)v2547 = v743; - *(int16x4_t *)v2565 = v759; - *(int16x4_t *)v2610 = v883; - *(int16x4_t *)v2628 = v899; - *(int16x4_t *)v2691 = v1039; - *(int16x4_t *)v2709 = v1055; - *(int16x4_t *)v2754 = v1179; - *(int16x4_t *)v2772 = v1195; + vst1_s16((int16_t *)v2547, v743); + vst1_s16((int16_t *)v2565, v759); + vst1_s16((int16_t *)v2610, v883); + vst1_s16((int16_t *)v2628, v899); + vst1_s16((int16_t *)v2691, v1039); + vst1_s16((int16_t *)v2709, v1055); + vst1_s16((int16_t *)v2754, v1179); + vst1_s16((int16_t *)v2772, v1195); int16x4_t v891 = vqmovn_s32(vcvtq_n_s32_f32(v879, 15)); int16x4_t v907 = vqmovn_s32(vcvtq_n_s32_f32(v880, 15)); int16x4_t v1187 = vqmovn_s32(vcvtq_n_s32_f32(v1175, 15)); int16x4_t v1203 = vqmovn_s32(vcvtq_n_s32_f32(v1176, 15)); - *(int16x4_t *)v2619 = v891; - *(int16x4_t *)v2637 = v907; - *(int16x4_t *)v2763 = v1187; - *(int16x4_t *)v2781 = v1203; + vst1_s16((int16_t *)v2619, v891); + vst1_s16((int16_t *)v2637, v907); + vst1_s16((int16_t *)v2763, v1187); + vst1_s16((int16_t *)v2781, v1203); v5 += 2 * 1; v6 += 2 * 1; } diff --git a/src/LowerPHY/FFT/fft_cs16.cpp b/src/LowerPHY/FFT/fft_cs16.cpp index c9cb8f797a504786960361cb5a1ddd08ff70a40f..856eda0584ebd54581291709f8bf97015272f440 100644 --- a/src/LowerPHY/FFT/fft_cs16.cpp +++ b/src/LowerPHY/FFT/fft_cs16.cpp @@ -11,7 +11,7 
@@ extern "C" { armral_status armral_fft_create_plan_cs16(armral_fft_plan_t **p, int n, armral_fft_direction_t dir) { return armral::fft::create_plan(p, n, dir); + armral_cmplx_f32_t>(p, n, dir, true); } armral_status armral_fft_execute_cs16(const armral_fft_plan_t *p, diff --git a/src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_uu.c b/src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_uu.c index d00d3766c37d291b1c9b211ba1db2c2a967be6c5..2181663e18674b8c0cd5dc82fa1e8b38653b5bf5 100644 --- a/src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_uu.c +++ b/src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_uu.c @@ -45,7 +45,7 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu7(const armral_cmplx_int16_t *restrict x, float32x2_t v145 = (float32x2_t){v143, v144}; const int32_t *v453 = &v5[0]; float32x2_t *v463 = &v6[0]; - int16x4_t v521 = *(const int16x4_t *)v398; + int16x4_t v521 = vld1_s16((const int16_t *)v398); float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v521), 15); float32x4_t v102 = vcombine_f32(v101, v101); float32x4_t v107 = vcombine_f32(v106, v106); @@ -65,17 +65,17 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu7(const armral_cmplx_int16_t *restrict x, float32x2_t *v499 = &v6[ostride * 4]; float32x2_t *v508 = &v6[ostride * 5]; float32x2_t *v517 = &v6[ostride * 6]; - int16x4_t v533 = *(const int16x4_t *)v453; + int16x4_t v533 = vld1_s16((const int16_t *)v453); float32x4_t v84 = vcvtq_n_f32_s32(vmovl_s16(v533), 15); float32x4_t v125 = vcombine_f32(v123, v123); float32x4_t v133 = vcombine_f32(v131, v131); float32x4_t v141 = vcombine_f32(v139, v139); float32x4_t v149 = vcombine_f32(v147, v147); - int16x4_t v523 = *(const int16x4_t *)v407; - int16x4_t v525 = *(const int16x4_t *)v416; - int16x4_t v527 = *(const int16x4_t *)v425; - int16x4_t v529 = *(const int16x4_t *)v434; - int16x4_t v531 = *(const int16x4_t *)v443; + int16x4_t v523 = vld1_s16((const int16_t *)v407); + int16x4_t v525 = vld1_s16((const int16_t *)v416); + int16x4_t v527 = vld1_s16((const int16_t *)v425); + int16x4_t v529 = vld1_s16((const int16_t 
*)v434); + int16x4_t v531 = vld1_s16((const int16_t *)v443); float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v523), 15); float32x4_t v46 = vcvtq_n_f32_s32(vmovl_s16(v525), 15); float32x4_t v54 = vcvtq_n_f32_s32(vmovl_s16(v527), 15); @@ -111,7 +111,7 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu7(const armral_cmplx_int16_t *restrict x, float32x4_t v150 = vmulq_f32(v148, v149); float32x4_t v126 = vmulq_f32(v124, v125); float32x4_t v151 = vaddq_f32(v85, v103); - *(float32x4_t *)v463 = v85; + vst1q_f32((float32_t *)v463, v85); float32x4_t v152 = vaddq_f32(v151, v108); float32x4_t v154 = vsubq_f32(v151, v108); float32x4_t v156 = vsubq_f32(v151, v113); @@ -130,12 +130,12 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu7(const armral_cmplx_int16_t *restrict x, float32x4_t v167 = vsubq_f32(v155, v161); float32x4_t v168 = vaddq_f32(v157, v163); float32x4_t v169 = vsubq_f32(v157, v163); - *(float32x4_t *)v472 = v165; - *(float32x4_t *)v481 = v167; - *(float32x4_t *)v490 = v168; - *(float32x4_t *)v499 = v169; - *(float32x4_t *)v508 = v166; - *(float32x4_t *)v517 = v164; + vst1q_f32((float32_t *)v472, v165); + vst1q_f32((float32_t *)v481, v167); + vst1q_f32((float32_t *)v490, v168); + vst1q_f32((float32_t *)v499, v169); + vst1q_f32((float32_t *)v508, v166); + vst1q_f32((float32_t *)v517, v164); v5 += 2 * 1; v6 += 2 * 1; } @@ -463,7 +463,7 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu9(const armral_cmplx_int16_t *restrict x, float32x2_t v177 = (float32x2_t){v175, v176}; const int32_t *v565 = &v5[0]; float32x2_t *v575 = &v6[0]; - int16x4_t v651 = *(const int16x4_t *)v492; + int16x4_t v651 = vld1_s16((const int16_t *)v492); float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v651), 15); float32x4_t v121 = vcombine_f32(v120, v120); float32x4_t v134 = vcombine_f32(v133, v133); @@ -488,19 +488,19 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu9(const armral_cmplx_int16_t *restrict x, float32x2_t *v629 = &v6[ostride * 6]; float32x2_t *v638 = &v6[ostride * 7]; float32x2_t *v647 = &v6[ostride * 8]; - int16x4_t v667 = 
*(const int16x4_t *)v565; + int16x4_t v667 = vld1_s16((const int16_t *)v565); float32x4_t v103 = vcvtq_n_f32_s32(vmovl_s16(v667), 15); float32x4_t v142 = vcombine_f32(v140, v140); float32x4_t v165 = vcombine_f32(v163, v163); float32x4_t v173 = vcombine_f32(v171, v171); float32x4_t v181 = vcombine_f32(v179, v179); - int16x4_t v653 = *(const int16x4_t *)v501; - int16x4_t v655 = *(const int16x4_t *)v510; - int16x4_t v657 = *(const int16x4_t *)v519; - int16x4_t v659 = *(const int16x4_t *)v528; - int16x4_t v661 = *(const int16x4_t *)v537; - int16x4_t v663 = *(const int16x4_t *)v546; - int16x4_t v665 = *(const int16x4_t *)v555; + int16x4_t v653 = vld1_s16((const int16_t *)v501); + int16x4_t v655 = vld1_s16((const int16_t *)v510); + int16x4_t v657 = vld1_s16((const int16_t *)v519); + int16x4_t v659 = vld1_s16((const int16_t *)v528); + int16x4_t v661 = vld1_s16((const int16_t *)v537); + int16x4_t v663 = vld1_s16((const int16_t *)v546); + int16x4_t v665 = vld1_s16((const int16_t *)v555); float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v653), 15); float32x4_t v46 = vcvtq_n_f32_s32(vmovl_s16(v655), 15); float32x4_t v54 = vcvtq_n_f32_s32(vmovl_s16(v657), 15); @@ -552,7 +552,7 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu9(const armral_cmplx_int16_t *restrict x, float32x4_t v197 = vaddq_f32(v196, v174); float32x4_t v199 = vaddq_f32(v198, v182); float32x4_t v201 = vsubq_f32(v200, v182); - *(float32x4_t *)v575 = v104; + vst1q_f32((float32_t *)v575, v104); float32x4_t v185 = vaddq_f32(v104, v184); float32x4_t v189 = vaddq_f32(v188, v183); float32x4_t v186 = vaddq_f32(v185, v130); @@ -563,20 +563,20 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu9(const armral_cmplx_int16_t *restrict x, float32x4_t v191 = vaddq_f32(v190, v153); float32x4_t v193 = vaddq_f32(v192, v158); float32x4_t v195 = vsubq_f32(v194, v158); - *(float32x4_t *)v602 = v187; - *(float32x4_t *)v629 = v186; + vst1q_f32((float32_t *)v602, v187); + vst1q_f32((float32_t *)v629, v186); float32x4_t v202 = vaddq_f32(v191, v197); 
float32x4_t v203 = vsubq_f32(v191, v197); float32x4_t v204 = vaddq_f32(v193, v199); float32x4_t v205 = vsubq_f32(v193, v199); float32x4_t v206 = vaddq_f32(v195, v201); float32x4_t v207 = vsubq_f32(v195, v201); - *(float32x4_t *)v584 = v203; - *(float32x4_t *)v593 = v204; - *(float32x4_t *)v611 = v207; - *(float32x4_t *)v620 = v206; - *(float32x4_t *)v638 = v205; - *(float32x4_t *)v647 = v202; + vst1q_f32((float32_t *)v584, v203); + vst1q_f32((float32_t *)v593, v204); + vst1q_f32((float32_t *)v611, v207); + vst1q_f32((float32_t *)v620, v206); + vst1q_f32((float32_t *)v638, v205); + vst1q_f32((float32_t *)v647, v202); v5 += 2 * 1; v6 += 2 * 1; } @@ -996,7 +996,7 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu11(const armral_cmplx_int16_t *restrict x, float32x2_t v275 = (float32x2_t){v273, v274}; const int32_t *v833 = &v5[0]; float32x2_t *v843 = &v6[0]; - int16x4_t v937 = *(const int16x4_t *)v742; + int16x4_t v937 = vld1_s16((const int16_t *)v742); float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v937), 15); float32x4_t v154 = vcombine_f32(v153, v153); float32x2_t v160 = vmul_f32(v276, v158); @@ -1036,7 +1036,7 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu11(const armral_cmplx_int16_t *restrict x, float32x2_t *v906 = &v6[ostride * 4]; float32x2_t *v915 = &v6[ostride * 3]; float32x2_t *v924 = &v6[ostride * 2]; - int16x4_t v957 = *(const int16x4_t *)v833; + int16x4_t v957 = vld1_s16((const int16_t *)v833); float32x4_t v124 = vcvtq_n_f32_s32(vmovl_s16(v957), 15); float32x4_t v162 = vcombine_f32(v160, v160); float32x4_t v215 = vcombine_f32(v213, v213); @@ -1048,15 +1048,15 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu11(const armral_cmplx_int16_t *restrict x, float32x4_t v263 = vcombine_f32(v261, v261); float32x4_t v271 = vcombine_f32(v269, v269); float32x4_t v279 = vcombine_f32(v277, v277); - int16x4_t v939 = *(const int16x4_t *)v751; - int16x4_t v941 = *(const int16x4_t *)v760; - int16x4_t v943 = *(const int16x4_t *)v769; - int16x4_t v945 = *(const int16x4_t *)v778; - int16x4_t v947 = 
*(const int16x4_t *)v787; - int16x4_t v949 = *(const int16x4_t *)v796; - int16x4_t v951 = *(const int16x4_t *)v805; - int16x4_t v953 = *(const int16x4_t *)v814; - int16x4_t v955 = *(const int16x4_t *)v823; + int16x4_t v939 = vld1_s16((const int16_t *)v751); + int16x4_t v941 = vld1_s16((const int16_t *)v760); + int16x4_t v943 = vld1_s16((const int16_t *)v769); + int16x4_t v945 = vld1_s16((const int16_t *)v778); + int16x4_t v947 = vld1_s16((const int16_t *)v787); + int16x4_t v949 = vld1_s16((const int16_t *)v796); + int16x4_t v951 = vld1_s16((const int16_t *)v805); + int16x4_t v953 = vld1_s16((const int16_t *)v814); + int16x4_t v955 = vld1_s16((const int16_t *)v823); float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v939), 15); float32x4_t v45 = vcvtq_n_f32_s32(vmovl_s16(v941), 15); float32x4_t v53 = vcvtq_n_f32_s32(vmovl_s16(v943), 15); @@ -1150,7 +1150,7 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu11(const armral_cmplx_int16_t *restrict x, float32x4_t v295 = vsubq_f32(v264, v280); float32x4_t v296 = vaddq_f32(v286, v288); float32x4_t v314 = vaddq_f32(v290, v291); - *(float32x4_t *)v843 = v125; + vst1q_f32((float32_t *)v843, v125); float32x4_t v297 = vaddq_f32(v296, v281); float32x4_t v298 = vsubq_f32(v281, v283); float32x4_t v300 = vaddq_f32(v281, v287); @@ -1181,16 +1181,16 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu11(const armral_cmplx_int16_t *restrict x, float32x4_t v326 = vsubq_f32(v297, v307); float32x4_t v318 = vaddq_f32(v305, v317); float32x4_t v327 = vsubq_f32(v305, v317); - *(float32x4_t *)v861 = v319; - *(float32x4_t *)v870 = v320; - *(float32x4_t *)v879 = v321; - *(float32x4_t *)v888 = v322; - *(float32x4_t *)v897 = v323; - *(float32x4_t *)v906 = v324; - *(float32x4_t *)v915 = v325; - *(float32x4_t *)v924 = v326; - *(float32x4_t *)v852 = v318; - *(float32x4_t *)v933 = v327; + vst1q_f32((float32_t *)v861, v319); + vst1q_f32((float32_t *)v870, v320); + vst1q_f32((float32_t *)v879, v321); + vst1q_f32((float32_t *)v888, v322); + vst1q_f32((float32_t *)v897, v323); + 
vst1q_f32((float32_t *)v906, v324); + vst1q_f32((float32_t *)v915, v325); + vst1q_f32((float32_t *)v924, v326); + vst1q_f32((float32_t *)v852, v318); + vst1q_f32((float32_t *)v933, v327); v5 += 2 * 1; v6 += 2 * 1; } @@ -1842,7 +1842,7 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu13(const armral_cmplx_int16_t *restrict x, float32x2_t v303 = (float32x2_t){v301, v302}; const int32_t *v935 = &v5[0]; float32x2_t *v945 = &v6[0]; - int16x4_t v1057 = *(const int16x4_t *)v826; + int16x4_t v1057 = vld1_s16((const int16_t *)v826); float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v1057), 15); float32x4_t v176 = vcombine_f32(v175, v175); float32x4_t v181 = vcombine_f32(v180, v180); @@ -1886,7 +1886,7 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu13(const armral_cmplx_int16_t *restrict x, float32x2_t *v1026 = &v6[ostride * 4]; float32x2_t *v1035 = &v6[ostride * 3]; float32x2_t *v1044 = &v6[ostride * 2]; - int16x4_t v1081 = *(const int16x4_t *)v935; + int16x4_t v1081 = vld1_s16((const int16_t *)v935); float32x4_t v159 = vcvtq_n_f32_s32(vmovl_s16(v1081), 15); float32x4_t v189 = vcombine_f32(v187, v187); float32x4_t v197 = vcombine_f32(v195, v195); @@ -1900,17 +1900,17 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu13(const armral_cmplx_int16_t *restrict x, float32x4_t v291 = vcombine_f32(v289, v289); float32x4_t v299 = vcombine_f32(v297, v297); float32x4_t v307 = vcombine_f32(v305, v305); - int16x4_t v1059 = *(const int16x4_t *)v835; - int16x4_t v1061 = *(const int16x4_t *)v844; - int16x4_t v1063 = *(const int16x4_t *)v853; - int16x4_t v1065 = *(const int16x4_t *)v862; - int16x4_t v1067 = *(const int16x4_t *)v871; - int16x4_t v1069 = *(const int16x4_t *)v880; - int16x4_t v1071 = *(const int16x4_t *)v889; - int16x4_t v1073 = *(const int16x4_t *)v898; - int16x4_t v1075 = *(const int16x4_t *)v907; - int16x4_t v1077 = *(const int16x4_t *)v916; - int16x4_t v1079 = *(const int16x4_t *)v925; + int16x4_t v1059 = vld1_s16((const int16_t *)v835); + int16x4_t v1061 = vld1_s16((const int16_t *)v844); + int16x4_t 
v1063 = vld1_s16((const int16_t *)v853); + int16x4_t v1065 = vld1_s16((const int16_t *)v862); + int16x4_t v1067 = vld1_s16((const int16_t *)v871); + int16x4_t v1069 = vld1_s16((const int16_t *)v880); + int16x4_t v1071 = vld1_s16((const int16_t *)v889); + int16x4_t v1073 = vld1_s16((const int16_t *)v898); + int16x4_t v1075 = vld1_s16((const int16_t *)v907); + int16x4_t v1077 = vld1_s16((const int16_t *)v916); + int16x4_t v1079 = vld1_s16((const int16_t *)v925); float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v1059), 15); float32x4_t v45 = vcvtq_n_f32_s32(vmovl_s16(v1061), 15); float32x4_t v53 = vcvtq_n_f32_s32(vmovl_s16(v1063), 15); @@ -2008,7 +2008,7 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu13(const armral_cmplx_int16_t *restrict x, float32x4_t v325 = vsubq_f32(v276, v284); float32x4_t v326 = vsubq_f32(v292, v308); float32x4_t v327 = vaddq_f32(v300, v308); - *(float32x4_t *)v945 = v160; + vst1q_f32((float32_t *)v945, v160); float32x4_t v313 = vaddq_f32(v312, v182); float32x4_t v315 = vsubq_f32(v314, v182); float32x4_t v316 = vaddq_f32(v309, v226); @@ -2049,18 +2049,18 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu13(const armral_cmplx_int16_t *restrict x, float32x4_t v357 = vaddq_f32(v332, v341); float32x4_t v358 = vsubq_f32(v331, v339); float32x4_t v359 = vaddq_f32(v330, v337); - *(float32x4_t *)v954 = v348; - *(float32x4_t *)v963 = v349; - *(float32x4_t *)v972 = v350; - *(float32x4_t *)v981 = v351; - *(float32x4_t *)v990 = v352; - *(float32x4_t *)v999 = v353; - *(float32x4_t *)v1008 = v354; - *(float32x4_t *)v1017 = v355; - *(float32x4_t *)v1026 = v356; - *(float32x4_t *)v1035 = v357; - *(float32x4_t *)v1044 = v358; - *(float32x4_t *)v1053 = v359; + vst1q_f32((float32_t *)v954, v348); + vst1q_f32((float32_t *)v963, v349); + vst1q_f32((float32_t *)v972, v350); + vst1q_f32((float32_t *)v981, v351); + vst1q_f32((float32_t *)v990, v352); + vst1q_f32((float32_t *)v999, v353); + vst1q_f32((float32_t *)v1008, v354); + vst1q_f32((float32_t *)v1017, v355); + vst1q_f32((float32_t 
*)v1026, v356); + vst1q_f32((float32_t *)v1035, v357); + vst1q_f32((float32_t *)v1044, v358); + vst1q_f32((float32_t *)v1053, v359); v5 += 2 * 1; v6 += 2 * 1; } @@ -2748,7 +2748,7 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu14(const armral_cmplx_int16_t *restrict x, float32x2_t v308 = (float32x2_t){v306, v307}; const int32_t *v786 = &v5[0]; float32x2_t *v913 = &v6[0]; - int16x4_t v1052 = *(const int16x4_t *)v867; + int16x4_t v1052 = vld1_s16((const int16_t *)v867); float32x4_t v108 = vcvtq_n_f32_s32(vmovl_s16(v1052), 15); float32x4_t v265 = vcombine_f32(v264, v264); float32x4_t v270 = vcombine_f32(v269, v269); @@ -2782,24 +2782,24 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu14(const armral_cmplx_int16_t *restrict x, float32x2_t *v1012 = &v6[ostride * 5]; float32x2_t *v1021 = &v6[ostride * 6]; float32x2_t *v1030 = &v6[ostride * 13]; - int16x4_t v1034 = *(const int16x4_t *)v786; + int16x4_t v1034 = vld1_s16((const int16_t *)v786); float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v1034), 15); float32x4_t v288 = vcombine_f32(v286, v286); float32x4_t v296 = vcombine_f32(v294, v294); float32x4_t v304 = vcombine_f32(v302, v302); float32x4_t v312 = vcombine_f32(v310, v310); - int16x4_t v1036 = *(const int16x4_t *)v795; - int16x4_t v1038 = *(const int16x4_t *)v804; - int16x4_t v1040 = *(const int16x4_t *)v813; - int16x4_t v1042 = *(const int16x4_t *)v822; - int16x4_t v1044 = *(const int16x4_t *)v831; - int16x4_t v1046 = *(const int16x4_t *)v840; - int16x4_t v1048 = *(const int16x4_t *)v849; - int16x4_t v1050 = *(const int16x4_t *)v858; - int16x4_t v1054 = *(const int16x4_t *)v876; - int16x4_t v1056 = *(const int16x4_t *)v885; - int16x4_t v1058 = *(const int16x4_t *)v894; - int16x4_t v1060 = *(const int16x4_t *)v903; + int16x4_t v1036 = vld1_s16((const int16_t *)v795); + int16x4_t v1038 = vld1_s16((const int16_t *)v804); + int16x4_t v1040 = vld1_s16((const int16_t *)v813); + int16x4_t v1042 = vld1_s16((const int16_t *)v822); + int16x4_t v1044 = vld1_s16((const int16_t *)v831); + 
int16x4_t v1046 = vld1_s16((const int16_t *)v840); + int16x4_t v1048 = vld1_s16((const int16_t *)v849); + int16x4_t v1050 = vld1_s16((const int16_t *)v858); + int16x4_t v1054 = vld1_s16((const int16_t *)v876); + int16x4_t v1056 = vld1_s16((const int16_t *)v885); + int16x4_t v1058 = vld1_s16((const int16_t *)v894); + int16x4_t v1060 = vld1_s16((const int16_t *)v903); float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v1036), 15); float32x4_t v46 = vcvtq_n_f32_s32(vmovl_s16(v1038), 15); float32x4_t v54 = vcvtq_n_f32_s32(vmovl_s16(v1040), 15); @@ -2886,8 +2886,8 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu14(const armral_cmplx_int16_t *restrict x, float32x4_t v221 = vaddq_f32(v155, v173); float32x4_t v289 = vmulq_f32(v287, v288); float32x4_t v314 = vaddq_f32(v248, v266); - *(float32x4_t *)v913 = v155; - *(float32x4_t *)v922 = v248; + vst1q_f32((float32_t *)v913, v155); + vst1q_f32((float32_t *)v922, v248); float32x4_t v222 = vaddq_f32(v221, v178); float32x4_t v224 = vsubq_f32(v221, v178); float32x4_t v226 = vsubq_f32(v221, v183); @@ -2924,18 +2924,18 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu14(const armral_cmplx_int16_t *restrict x, float32x4_t v330 = vsubq_f32(v318, v324); float32x4_t v331 = vaddq_f32(v320, v326); float32x4_t v332 = vsubq_f32(v320, v326); - *(float32x4_t *)v931 = v235; - *(float32x4_t *)v940 = v328; - *(float32x4_t *)v949 = v237; - *(float32x4_t *)v958 = v330; - *(float32x4_t *)v967 = v238; - *(float32x4_t *)v976 = v331; - *(float32x4_t *)v985 = v239; - *(float32x4_t *)v994 = v332; - *(float32x4_t *)v1003 = v236; - *(float32x4_t *)v1012 = v329; - *(float32x4_t *)v1021 = v234; - *(float32x4_t *)v1030 = v327; + vst1q_f32((float32_t *)v931, v235); + vst1q_f32((float32_t *)v940, v328); + vst1q_f32((float32_t *)v949, v237); + vst1q_f32((float32_t *)v958, v330); + vst1q_f32((float32_t *)v967, v238); + vst1q_f32((float32_t *)v976, v331); + vst1q_f32((float32_t *)v985, v239); + vst1q_f32((float32_t *)v994, v332); + vst1q_f32((float32_t *)v1003, v236); + 
vst1q_f32((float32_t *)v1012, v329); + vst1q_f32((float32_t *)v1021, v234); + vst1q_f32((float32_t *)v1030, v327); v5 += 2 * 1; v6 += 2 * 1; } @@ -3542,7 +3542,7 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu15(const armral_cmplx_int16_t *restrict x, float32x2_t v312 = (float32x2_t){v311, v311}; const int32_t *v826 = &v5[0]; float32x2_t *v944 = &v6[0]; - int16x4_t v1088 = *(const int16x4_t *)v871; + int16x4_t v1088 = vld1_s16((const int16_t *)v871); float32x4_t v90 = vcvtq_n_f32_s32(vmovl_s16(v1088), 15); float32x4_t v172 = vcombine_f32(v171, v171); float32x4_t v177 = vcombine_f32(v176, v176); @@ -3587,7 +3587,7 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu15(const armral_cmplx_int16_t *restrict x, float32x2_t *v1052 = &v6[ostride * 9]; float32x2_t *v1061 = &v6[ostride * 4]; float32x2_t *v1070 = &v6[ostride * 14]; - int16x4_t v1078 = *(const int16x4_t *)v826; + int16x4_t v1078 = vld1_s16((const int16_t *)v826); float32x4_t v46 = vcvtq_n_f32_s32(vmovl_s16(v1078), 15); float32x4_t v185 = vcombine_f32(v183, v183); float32x4_t v193 = vcombine_f32(v191, v191); @@ -3598,19 +3598,19 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu15(const armral_cmplx_int16_t *restrict x, float32x4_t v282 = vcombine_f32(v280, v280); float32x4_t v290 = vcombine_f32(v288, v288); float32x4_t v298 = vcombine_f32(v296, v296); - int16x4_t v1074 = *(const int16x4_t *)v807; - int16x4_t v1076 = *(const int16x4_t *)v816; - int16x4_t v1080 = *(const int16x4_t *)v835; - int16x4_t v1082 = *(const int16x4_t *)v844; - int16x4_t v1084 = *(const int16x4_t *)v853; - int16x4_t v1086 = *(const int16x4_t *)v862; - int16x4_t v1090 = *(const int16x4_t *)v880; - int16x4_t v1092 = *(const int16x4_t *)v889; - int16x4_t v1094 = *(const int16x4_t *)v898; - int16x4_t v1096 = *(const int16x4_t *)v907; - int16x4_t v1098 = *(const int16x4_t *)v916; - int16x4_t v1100 = *(const int16x4_t *)v925; - int16x4_t v1102 = *(const int16x4_t *)v934; + int16x4_t v1074 = vld1_s16((const int16_t *)v807); + int16x4_t v1076 = vld1_s16((const int16_t 
*)v816); + int16x4_t v1080 = vld1_s16((const int16_t *)v835); + int16x4_t v1082 = vld1_s16((const int16_t *)v844); + int16x4_t v1084 = vld1_s16((const int16_t *)v853); + int16x4_t v1086 = vld1_s16((const int16_t *)v862); + int16x4_t v1090 = vld1_s16((const int16_t *)v880); + int16x4_t v1092 = vld1_s16((const int16_t *)v889); + int16x4_t v1094 = vld1_s16((const int16_t *)v898); + int16x4_t v1096 = vld1_s16((const int16_t *)v907); + int16x4_t v1098 = vld1_s16((const int16_t *)v916); + int16x4_t v1100 = vld1_s16((const int16_t *)v925); + int16x4_t v1102 = vld1_s16((const int16_t *)v934); float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v1074), 15); float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v1076), 15); float32x4_t v55 = vcvtq_n_f32_s32(vmovl_s16(v1080), 15); @@ -3696,7 +3696,7 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu15(const armral_cmplx_int16_t *restrict x, float32x4_t v263 = vaddq_f32(v250, v258); float32x4_t v283 = vmulq_f32(v281, v282); float32x4_t v324 = vaddq_f32(v163, v224); - *(float32x4_t *)v944 = v163; + vst1q_f32((float32_t *)v944, v163); float32x4_t v204 = vaddq_f32(v203, v178); float32x4_t v205 = vsubq_f32(v203, v178); float32x4_t v206 = vsubq_f32(v186, v194); @@ -3716,8 +3716,8 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu15(const armral_cmplx_int16_t *restrict x, float32x4_t v267 = vsubq_f32(v261, v263); float32x4_t v316 = vaddq_f32(v315, v299); float32x4_t v317 = vsubq_f32(v315, v299); - *(float32x4_t *)v953 = v326; - *(float32x4_t *)v962 = v325; + vst1q_f32((float32_t *)v953, v326); + vst1q_f32((float32_t *)v962, v325); float32x4_t v320 = vaddq_f32(v316, v318); float32x4_t v321 = vsubq_f32(v316, v318); float32x4_t v322 = vaddq_f32(v317, v319); @@ -3726,10 +3726,10 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu15(const armral_cmplx_int16_t *restrict x, float32x4_t v372 = vaddq_f32(v211, v267); float32x4_t v396 = vaddq_f32(v210, v266); float32x4_t v420 = vaddq_f32(v208, v264); - *(float32x4_t *)v971 = v209; - *(float32x4_t *)v998 = v211; - *(float32x4_t *)v1025 = 
v210; - *(float32x4_t *)v1052 = v208; + vst1q_f32((float32_t *)v971, v209); + vst1q_f32((float32_t *)v998, v211); + vst1q_f32((float32_t *)v1025, v210); + vst1q_f32((float32_t *)v1052, v208); float32x4_t v349 = vaddq_f32(v348, v321); float32x4_t v350 = vsubq_f32(v348, v321); float32x4_t v373 = vaddq_f32(v372, v323); @@ -3738,14 +3738,14 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu15(const armral_cmplx_int16_t *restrict x, float32x4_t v398 = vsubq_f32(v396, v322); float32x4_t v421 = vaddq_f32(v420, v320); float32x4_t v422 = vsubq_f32(v420, v320); - *(float32x4_t *)v980 = v350; - *(float32x4_t *)v989 = v349; - *(float32x4_t *)v1007 = v374; - *(float32x4_t *)v1016 = v373; - *(float32x4_t *)v1034 = v398; - *(float32x4_t *)v1043 = v397; - *(float32x4_t *)v1061 = v422; - *(float32x4_t *)v1070 = v421; + vst1q_f32((float32_t *)v980, v350); + vst1q_f32((float32_t *)v989, v349); + vst1q_f32((float32_t *)v1007, v374); + vst1q_f32((float32_t *)v1016, v373); + vst1q_f32((float32_t *)v1034, v398); + vst1q_f32((float32_t *)v1043, v397); + vst1q_f32((float32_t *)v1061, v422); + vst1q_f32((float32_t *)v1070, v421); v5 += 2 * 1; v6 += 2 * 1; } @@ -4382,7 +4382,7 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu16(const armral_cmplx_int16_t *restrict x, float32x2_t v300 = (float32x2_t){v299, v299}; const int32_t *v834 = &v5[0]; float32x2_t *v979 = &v6[0]; - int16x4_t v1134 = *(const int16x4_t *)v906; + int16x4_t v1134 = vld1_s16((const int16_t *)v906); float32x4_t v100 = vcvtq_n_f32_s32(vmovl_s16(v1134), 15); float32x2_t v247 = vmul_f32(v283, v245); float32x2_t v255 = vmul_f32(v283, v253); @@ -4421,27 +4421,27 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu16(const armral_cmplx_int16_t *restrict x, float32x2_t *v1096 = &v6[ostride * 13]; float32x2_t *v1105 = &v6[ostride * 14]; float32x2_t *v1114 = &v6[ostride * 15]; - int16x4_t v1118 = *(const int16x4_t *)v834; + int16x4_t v1118 = vld1_s16((const int16_t *)v834); float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v1118), 15); float32x4_t v249 = 
vcombine_f32(v247, v247); float32x4_t v257 = vcombine_f32(v255, v255); float32x4_t v270 = vcombine_f32(v268, v268); float32x4_t v278 = vcombine_f32(v276, v276); float32x4_t v286 = vcombine_f32(v284, v284); - int16x4_t v1120 = *(const int16x4_t *)v843; - int16x4_t v1122 = *(const int16x4_t *)v852; - int16x4_t v1124 = *(const int16x4_t *)v861; - int16x4_t v1126 = *(const int16x4_t *)v870; - int16x4_t v1128 = *(const int16x4_t *)v879; - int16x4_t v1130 = *(const int16x4_t *)v888; - int16x4_t v1132 = *(const int16x4_t *)v897; - int16x4_t v1136 = *(const int16x4_t *)v915; - int16x4_t v1138 = *(const int16x4_t *)v924; - int16x4_t v1140 = *(const int16x4_t *)v933; - int16x4_t v1142 = *(const int16x4_t *)v942; - int16x4_t v1144 = *(const int16x4_t *)v951; - int16x4_t v1146 = *(const int16x4_t *)v960; - int16x4_t v1148 = *(const int16x4_t *)v969; + int16x4_t v1120 = vld1_s16((const int16_t *)v843); + int16x4_t v1122 = vld1_s16((const int16_t *)v852); + int16x4_t v1124 = vld1_s16((const int16_t *)v861); + int16x4_t v1126 = vld1_s16((const int16_t *)v870); + int16x4_t v1128 = vld1_s16((const int16_t *)v879); + int16x4_t v1130 = vld1_s16((const int16_t *)v888); + int16x4_t v1132 = vld1_s16((const int16_t *)v897); + int16x4_t v1136 = vld1_s16((const int16_t *)v915); + int16x4_t v1138 = vld1_s16((const int16_t *)v924); + int16x4_t v1140 = vld1_s16((const int16_t *)v933); + int16x4_t v1142 = vld1_s16((const int16_t *)v942); + int16x4_t v1144 = vld1_s16((const int16_t *)v951); + int16x4_t v1146 = vld1_s16((const int16_t *)v960); + int16x4_t v1148 = vld1_s16((const int16_t *)v969); float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v1120), 15); float32x4_t v46 = vcvtq_n_f32_s32(vmovl_s16(v1122), 15); float32x4_t v54 = vcvtq_n_f32_s32(vmovl_s16(v1124), 15); @@ -4527,8 +4527,8 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu16(const armral_cmplx_int16_t *restrict x, float32x4_t v320 = vsubq_f32(v302, v292); float32x4_t v321 = vsubq_f32(v292, v297); float32x4_t v322 = vsubq_f32(v292, v302); - 
*(float32x4_t *)v979 = v177; - *(float32x4_t *)v1051 = v178; + vst1q_f32((float32_t *)v979, v177); + vst1q_f32((float32_t *)v1051, v178); float32x4_t v303 = vaddq_f32(v174, v211); float32x4_t v304 = vsubq_f32(v174, v211); float32x4_t v306 = vaddq_f32(v224, v232); @@ -4553,8 +4553,8 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu16(const armral_cmplx_int16_t *restrict x, float32x4_t v336 = vsubq_f32(v318, v320); float32x4_t v337 = vaddq_f32(v318, v316); float32x4_t v338 = vsubq_f32(v318, v316); - *(float32x4_t *)v1015 = v304; - *(float32x4_t *)v1087 = v303; + vst1q_f32((float32_t *)v1015, v304); + vst1q_f32((float32_t *)v1087, v303); float32x4_t v339 = vaddq_f32(v323, v333); float32x4_t v340 = vaddq_f32(v324, v334); float32x4_t v341 = vsubq_f32(v325, v334); @@ -4563,18 +4563,18 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu16(const armral_cmplx_int16_t *restrict x, float32x4_t v344 = vaddq_f32(v328, v336); float32x4_t v345 = vsubq_f32(v329, v338); float32x4_t v346 = vsubq_f32(v330, v337); - *(float32x4_t *)v997 = v312; - *(float32x4_t *)v1033 = v311; - *(float32x4_t *)v1069 = v310; - *(float32x4_t *)v1105 = v309; - *(float32x4_t *)v988 = v342; - *(float32x4_t *)v1006 = v345; - *(float32x4_t *)v1024 = v346; - *(float32x4_t *)v1042 = v341; - *(float32x4_t *)v1060 = v340; - *(float32x4_t *)v1078 = v343; - *(float32x4_t *)v1096 = v344; - *(float32x4_t *)v1114 = v339; + vst1q_f32((float32_t *)v997, v312); + vst1q_f32((float32_t *)v1033, v311); + vst1q_f32((float32_t *)v1069, v310); + vst1q_f32((float32_t *)v1105, v309); + vst1q_f32((float32_t *)v988, v342); + vst1q_f32((float32_t *)v1006, v345); + vst1q_f32((float32_t *)v1024, v346); + vst1q_f32((float32_t *)v1042, v341); + vst1q_f32((float32_t *)v1060, v340); + vst1q_f32((float32_t *)v1078, v343); + vst1q_f32((float32_t *)v1096, v344); + vst1q_f32((float32_t *)v1114, v339); v5 += 2 * 1; v6 += 2 * 1; } @@ -5252,7 +5252,7 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu17(const armral_cmplx_int16_t *restrict x, float32x2_t v461 = 
(float32x2_t){v459, v460}; const int32_t *v1390 = &v5[0]; float32x2_t *v1400 = &v6[0]; - int16x4_t v1548 = *(const int16x4_t *)v1245; + int16x4_t v1548 = vld1_s16((const int16_t *)v1245); float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v1548), 15); float32x4_t v232 = vcombine_f32(v231, v231); float32x4_t v237 = vcombine_f32(v236, v236); @@ -5319,7 +5319,7 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu17(const armral_cmplx_int16_t *restrict x, float32x2_t *v1526 = &v6[ostride * 10]; float32x2_t *v1535 = &v6[ostride * 8]; float32x2_t *v1544 = &v6[ostride * 9]; - int16x4_t v1580 = *(const int16x4_t *)v1390; + int16x4_t v1580 = vld1_s16((const int16_t *)v1390); float32x4_t v222 = vcvtq_n_f32_s32(vmovl_s16(v1580), 15); float32x4_t v305 = vcombine_f32(v303, v303); float32x4_t v313 = vcombine_f32(v311, v311); @@ -5342,21 +5342,21 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu17(const armral_cmplx_int16_t *restrict x, float32x4_t v449 = vcombine_f32(v447, v447); float32x4_t v457 = vcombine_f32(v455, v455); float32x4_t v465 = vcombine_f32(v463, v463); - int16x4_t v1550 = *(const int16x4_t *)v1254; - int16x4_t v1552 = *(const int16x4_t *)v1263; - int16x4_t v1554 = *(const int16x4_t *)v1272; - int16x4_t v1556 = *(const int16x4_t *)v1281; - int16x4_t v1558 = *(const int16x4_t *)v1290; - int16x4_t v1560 = *(const int16x4_t *)v1299; - int16x4_t v1562 = *(const int16x4_t *)v1308; - int16x4_t v1564 = *(const int16x4_t *)v1317; - int16x4_t v1566 = *(const int16x4_t *)v1326; - int16x4_t v1568 = *(const int16x4_t *)v1335; - int16x4_t v1570 = *(const int16x4_t *)v1344; - int16x4_t v1572 = *(const int16x4_t *)v1353; - int16x4_t v1574 = *(const int16x4_t *)v1362; - int16x4_t v1576 = *(const int16x4_t *)v1371; - int16x4_t v1578 = *(const int16x4_t *)v1380; + int16x4_t v1550 = vld1_s16((const int16_t *)v1254); + int16x4_t v1552 = vld1_s16((const int16_t *)v1263); + int16x4_t v1554 = vld1_s16((const int16_t *)v1272); + int16x4_t v1556 = vld1_s16((const int16_t *)v1281); + int16x4_t v1558 = 
vld1_s16((const int16_t *)v1290); + int16x4_t v1560 = vld1_s16((const int16_t *)v1299); + int16x4_t v1562 = vld1_s16((const int16_t *)v1308); + int16x4_t v1564 = vld1_s16((const int16_t *)v1317); + int16x4_t v1566 = vld1_s16((const int16_t *)v1326); + int16x4_t v1568 = vld1_s16((const int16_t *)v1335); + int16x4_t v1570 = vld1_s16((const int16_t *)v1344); + int16x4_t v1572 = vld1_s16((const int16_t *)v1353); + int16x4_t v1574 = vld1_s16((const int16_t *)v1362); + int16x4_t v1576 = vld1_s16((const int16_t *)v1371); + int16x4_t v1578 = vld1_s16((const int16_t *)v1380); float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v1550), 15); float32x4_t v46 = vcvtq_n_f32_s32(vmovl_s16(v1552), 15); float32x4_t v54 = vcvtq_n_f32_s32(vmovl_s16(v1554), 15); @@ -5496,7 +5496,7 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu17(const armral_cmplx_int16_t *restrict x, float32x4_t v473 = vsubq_f32(v298, v258); float32x4_t v474 = vaddq_f32(v298, v253); float32x4_t v475 = vaddq_f32(v263, v223); - *(float32x4_t *)v1400 = v223; + vst1q_f32((float32_t *)v1400, v223); float32x4_t v213 = vsubq_f32(v212, v164); float32x4_t v448 = vrev64q_f32(v209); float32x4_t v476 = vaddq_f32(v268, v475); @@ -5565,8 +5565,8 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu17(const armral_cmplx_int16_t *restrict x, float32x4_t v521 = vaddq_f32(v513, v519); float32x4_t v527 = vaddq_f32(v526, v519); float32x4_t v538 = vaddq_f32(v537, v519); - *(float32x4_t *)v1445 = v580; - *(float32x4_t *)v1454 = v588; + vst1q_f32((float32_t *)v1445, v580); + vst1q_f32((float32_t *)v1454, v588); float32x4_t v523 = vaddq_f32(v522, v514); float32x4_t v525 = vaddq_f32(v524, v517); float32x4_t v529 = vsubq_f32(v528, v521); @@ -5584,24 +5584,24 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu17(const armral_cmplx_int16_t *restrict x, float32x4_t v620 = vsubq_f32(v489, v529); float32x4_t v628 = vaddq_f32(v487, v525); float32x4_t v636 = vsubq_f32(v487, v525); - *(float32x4_t *)v1427 = v564; - *(float32x4_t *)v1436 = v572; - *(float32x4_t *)v1535 = v660; - 
*(float32x4_t *)v1544 = v668; + vst1q_f32((float32_t *)v1427, v564); + vst1q_f32((float32_t *)v1436, v572); + vst1q_f32((float32_t *)v1535, v660); + vst1q_f32((float32_t *)v1544, v668); float32x4_t v596 = vaddq_f32(v490, v532); float32x4_t v604 = vsubq_f32(v490, v532); float32x4_t v644 = vaddq_f32(v491, v535); float32x4_t v652 = vsubq_f32(v491, v535); - *(float32x4_t *)v1409 = v548; - *(float32x4_t *)v1418 = v556; - *(float32x4_t *)v1481 = v612; - *(float32x4_t *)v1490 = v620; - *(float32x4_t *)v1499 = v628; - *(float32x4_t *)v1508 = v636; - *(float32x4_t *)v1463 = v596; - *(float32x4_t *)v1472 = v604; - *(float32x4_t *)v1517 = v644; - *(float32x4_t *)v1526 = v652; + vst1q_f32((float32_t *)v1409, v548); + vst1q_f32((float32_t *)v1418, v556); + vst1q_f32((float32_t *)v1481, v612); + vst1q_f32((float32_t *)v1490, v620); + vst1q_f32((float32_t *)v1499, v628); + vst1q_f32((float32_t *)v1508, v636); + vst1q_f32((float32_t *)v1463, v596); + vst1q_f32((float32_t *)v1472, v604); + vst1q_f32((float32_t *)v1517, v644); + vst1q_f32((float32_t *)v1526, v652); v5 += 2 * 1; v6 += 2 * 1; } @@ -6628,7 +6628,7 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu18(const armral_cmplx_int16_t *restrict x, float32x2_t v382 = (float32x2_t){v380, v381}; const int32_t *v982 = &v5[0]; float32x2_t *v1145 = &v6[0]; - int16x4_t v1324 = *(const int16x4_t *)v1081; + int16x4_t v1324 = vld1_s16((const int16_t *)v1081); float32x4_t v126 = vcvtq_n_f32_s32(vmovl_s16(v1324), 15); float32x4_t v326 = vcombine_f32(v325, v325); float32x4_t v339 = vcombine_f32(v338, v338); @@ -6671,28 +6671,28 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu18(const armral_cmplx_int16_t *restrict x, float32x2_t *v1280 = &v6[ostride * 7]; float32x2_t *v1289 = &v6[ostride * 8]; float32x2_t *v1298 = &v6[ostride * 17]; - int16x4_t v1302 = *(const int16x4_t *)v982; + int16x4_t v1302 = vld1_s16((const int16_t *)v982); float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v1302), 15); float32x4_t v347 = vcombine_f32(v345, v345); float32x4_t v370 = 
vcombine_f32(v368, v368); float32x4_t v378 = vcombine_f32(v376, v376); float32x4_t v386 = vcombine_f32(v384, v384); - int16x4_t v1304 = *(const int16x4_t *)v991; - int16x4_t v1306 = *(const int16x4_t *)v1000; - int16x4_t v1308 = *(const int16x4_t *)v1009; - int16x4_t v1310 = *(const int16x4_t *)v1018; - int16x4_t v1312 = *(const int16x4_t *)v1027; - int16x4_t v1314 = *(const int16x4_t *)v1036; - int16x4_t v1316 = *(const int16x4_t *)v1045; - int16x4_t v1318 = *(const int16x4_t *)v1054; - int16x4_t v1320 = *(const int16x4_t *)v1063; - int16x4_t v1322 = *(const int16x4_t *)v1072; - int16x4_t v1326 = *(const int16x4_t *)v1090; - int16x4_t v1328 = *(const int16x4_t *)v1099; - int16x4_t v1330 = *(const int16x4_t *)v1108; - int16x4_t v1332 = *(const int16x4_t *)v1117; - int16x4_t v1334 = *(const int16x4_t *)v1126; - int16x4_t v1336 = *(const int16x4_t *)v1135; + int16x4_t v1304 = vld1_s16((const int16_t *)v991); + int16x4_t v1306 = vld1_s16((const int16_t *)v1000); + int16x4_t v1308 = vld1_s16((const int16_t *)v1009); + int16x4_t v1310 = vld1_s16((const int16_t *)v1018); + int16x4_t v1312 = vld1_s16((const int16_t *)v1027); + int16x4_t v1314 = vld1_s16((const int16_t *)v1036); + int16x4_t v1316 = vld1_s16((const int16_t *)v1045); + int16x4_t v1318 = vld1_s16((const int16_t *)v1054); + int16x4_t v1320 = vld1_s16((const int16_t *)v1063); + int16x4_t v1322 = vld1_s16((const int16_t *)v1072); + int16x4_t v1326 = vld1_s16((const int16_t *)v1090); + int16x4_t v1328 = vld1_s16((const int16_t *)v1099); + int16x4_t v1330 = vld1_s16((const int16_t *)v1108); + int16x4_t v1332 = vld1_s16((const int16_t *)v1117); + int16x4_t v1334 = vld1_s16((const int16_t *)v1126); + int16x4_t v1336 = vld1_s16((const int16_t *)v1135); float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v1304), 15); float32x4_t v46 = vcvtq_n_f32_s32(vmovl_s16(v1306), 15); float32x4_t v54 = vcvtq_n_f32_s32(vmovl_s16(v1308), 15); @@ -6815,8 +6815,8 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu18(const armral_cmplx_int16_t 
*restrict x, float32x4_t v402 = vaddq_f32(v401, v379); float32x4_t v404 = vaddq_f32(v403, v387); float32x4_t v406 = vsubq_f32(v405, v387); - *(float32x4_t *)v1145 = v194; - *(float32x4_t *)v1154 = v309; + vst1q_f32((float32_t *)v1145, v194); + vst1q_f32((float32_t *)v1154, v309); float32x4_t v275 = vaddq_f32(v194, v274); float32x4_t v279 = vaddq_f32(v278, v273); float32x4_t v390 = vaddq_f32(v309, v389); @@ -6837,10 +6837,10 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu18(const armral_cmplx_int16_t *restrict x, float32x4_t v396 = vaddq_f32(v395, v358); float32x4_t v398 = vaddq_f32(v397, v363); float32x4_t v400 = vsubq_f32(v399, v363); - *(float32x4_t *)v1199 = v277; - *(float32x4_t *)v1208 = v392; - *(float32x4_t *)v1253 = v276; - *(float32x4_t *)v1262 = v391; + vst1q_f32((float32_t *)v1199, v277); + vst1q_f32((float32_t *)v1208, v392); + vst1q_f32((float32_t *)v1253, v276); + vst1q_f32((float32_t *)v1262, v391); float32x4_t v292 = vaddq_f32(v281, v287); float32x4_t v293 = vsubq_f32(v281, v287); float32x4_t v294 = vaddq_f32(v283, v289); @@ -6853,18 +6853,18 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu18(const armral_cmplx_int16_t *restrict x, float32x4_t v410 = vsubq_f32(v398, v404); float32x4_t v411 = vaddq_f32(v400, v406); float32x4_t v412 = vsubq_f32(v400, v406); - *(float32x4_t *)v1163 = v293; - *(float32x4_t *)v1172 = v408; - *(float32x4_t *)v1181 = v294; - *(float32x4_t *)v1190 = v409; - *(float32x4_t *)v1217 = v297; - *(float32x4_t *)v1226 = v412; - *(float32x4_t *)v1235 = v296; - *(float32x4_t *)v1244 = v411; - *(float32x4_t *)v1271 = v295; - *(float32x4_t *)v1280 = v410; - *(float32x4_t *)v1289 = v292; - *(float32x4_t *)v1298 = v407; + vst1q_f32((float32_t *)v1163, v293); + vst1q_f32((float32_t *)v1172, v408); + vst1q_f32((float32_t *)v1181, v294); + vst1q_f32((float32_t *)v1190, v409); + vst1q_f32((float32_t *)v1217, v297); + vst1q_f32((float32_t *)v1226, v412); + vst1q_f32((float32_t *)v1235, v296); + vst1q_f32((float32_t *)v1244, v411); + vst1q_f32((float32_t 
*)v1271, v295); + vst1q_f32((float32_t *)v1280, v410); + vst1q_f32((float32_t *)v1289, v292); + vst1q_f32((float32_t *)v1298, v407); v5 += 2 * 1; v6 += 2 * 1; } @@ -7659,7 +7659,7 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu19(const armral_cmplx_int16_t *restrict x, float32x2_t v502 = (float32x2_t){v500, v501}; const int32_t *v1533 = &v5[0]; float32x2_t *v1543 = &v6[0]; - int16x4_t v1709 = *(const int16x4_t *)v1370; + int16x4_t v1709 = vld1_s16((const int16_t *)v1370); float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v1709), 15); float32x4_t v264 = vcombine_f32(v263, v263); float32x4_t v269 = vcombine_f32(v268, v268); @@ -7733,7 +7733,7 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu19(const armral_cmplx_int16_t *restrict x, float32x2_t *v1687 = &v6[ostride * 11]; float32x2_t *v1696 = &v6[ostride * 9]; float32x2_t *v1705 = &v6[ostride * 10]; - int16x4_t v1745 = *(const int16x4_t *)v1533; + int16x4_t v1745 = vld1_s16((const int16_t *)v1533); float32x4_t v206 = vcvtq_n_f32_s32(vmovl_s16(v1745), 15); float32x4_t v362 = vcombine_f32(v360, v360); float32x4_t v370 = vcombine_f32(v368, v368); @@ -7754,23 +7754,23 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu19(const armral_cmplx_int16_t *restrict x, float32x4_t v490 = vcombine_f32(v488, v488); float32x4_t v498 = vcombine_f32(v496, v496); float32x4_t v506 = vcombine_f32(v504, v504); - int16x4_t v1711 = *(const int16x4_t *)v1379; - int16x4_t v1713 = *(const int16x4_t *)v1388; - int16x4_t v1715 = *(const int16x4_t *)v1397; - int16x4_t v1717 = *(const int16x4_t *)v1406; - int16x4_t v1719 = *(const int16x4_t *)v1415; - int16x4_t v1721 = *(const int16x4_t *)v1424; - int16x4_t v1723 = *(const int16x4_t *)v1433; - int16x4_t v1725 = *(const int16x4_t *)v1442; - int16x4_t v1727 = *(const int16x4_t *)v1451; - int16x4_t v1729 = *(const int16x4_t *)v1460; - int16x4_t v1731 = *(const int16x4_t *)v1469; - int16x4_t v1733 = *(const int16x4_t *)v1478; - int16x4_t v1735 = *(const int16x4_t *)v1487; - int16x4_t v1737 = *(const int16x4_t *)v1496; - int16x4_t 
v1739 = *(const int16x4_t *)v1505; - int16x4_t v1741 = *(const int16x4_t *)v1514; - int16x4_t v1743 = *(const int16x4_t *)v1523; + int16x4_t v1711 = vld1_s16((const int16_t *)v1379); + int16x4_t v1713 = vld1_s16((const int16_t *)v1388); + int16x4_t v1715 = vld1_s16((const int16_t *)v1397); + int16x4_t v1717 = vld1_s16((const int16_t *)v1406); + int16x4_t v1719 = vld1_s16((const int16_t *)v1415); + int16x4_t v1721 = vld1_s16((const int16_t *)v1424); + int16x4_t v1723 = vld1_s16((const int16_t *)v1433); + int16x4_t v1725 = vld1_s16((const int16_t *)v1442); + int16x4_t v1727 = vld1_s16((const int16_t *)v1451); + int16x4_t v1729 = vld1_s16((const int16_t *)v1460); + int16x4_t v1731 = vld1_s16((const int16_t *)v1469); + int16x4_t v1733 = vld1_s16((const int16_t *)v1478); + int16x4_t v1735 = vld1_s16((const int16_t *)v1487); + int16x4_t v1737 = vld1_s16((const int16_t *)v1496); + int16x4_t v1739 = vld1_s16((const int16_t *)v1505); + int16x4_t v1741 = vld1_s16((const int16_t *)v1514); + int16x4_t v1743 = vld1_s16((const int16_t *)v1523); float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v1711), 15); float32x4_t v46 = vcvtq_n_f32_s32(vmovl_s16(v1713), 15); float32x4_t v54 = vcvtq_n_f32_s32(vmovl_s16(v1715), 15); @@ -7936,7 +7936,7 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu19(const armral_cmplx_int16_t *restrict x, float32x4_t v534 = vaddq_f32(v265, v207); float32x4_t v539 = vaddq_f32(v371, v379); float32x4_t v540 = vaddq_f32(v395, v403); - *(float32x4_t *)v1543 = v207; + vst1q_f32((float32_t *)v1543, v207); float32x4_t v483 = vmulq_f32(v481, v482); float32x4_t v510 = vaddq_f32(v335, v340); float32x4_t v514 = vaddq_f32(v330, v340); @@ -8020,34 +8020,34 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu19(const armral_cmplx_int16_t *restrict x, float32x4_t v672 = vsubq_f32(v574, v586); float32x4_t v712 = vsubq_f32(v578, v590); float32x4_t v720 = vaddq_f32(v578, v590); - *(float32x4_t *)v1570 = v616; - *(float32x4_t *)v1579 = v624; - *(float32x4_t *)v1588 = v632; - *(float32x4_t *)v1597 = 
v640; + vst1q_f32((float32_t *)v1570, v616); + vst1q_f32((float32_t *)v1579, v624); + vst1q_f32((float32_t *)v1588, v632); + vst1q_f32((float32_t *)v1597, v640); float32x4_t v582 = vaddq_f32(v581, v565); float32x4_t v584 = vaddq_f32(v583, v567); float32x4_t v680 = vaddq_f32(v576, v588); float32x4_t v688 = vsubq_f32(v576, v588); float32x4_t v696 = vaddq_f32(v575, v587); float32x4_t v704 = vsubq_f32(v575, v587); - *(float32x4_t *)v1606 = v648; - *(float32x4_t *)v1615 = v656; - *(float32x4_t *)v1624 = v664; - *(float32x4_t *)v1633 = v672; - *(float32x4_t *)v1678 = v712; - *(float32x4_t *)v1687 = v720; + vst1q_f32((float32_t *)v1606, v648); + vst1q_f32((float32_t *)v1615, v656); + vst1q_f32((float32_t *)v1624, v664); + vst1q_f32((float32_t *)v1633, v672); + vst1q_f32((float32_t *)v1678, v712); + vst1q_f32((float32_t *)v1687, v720); float32x4_t v600 = vaddq_f32(v570, v582); float32x4_t v608 = vsubq_f32(v570, v582); float32x4_t v728 = vaddq_f32(v572, v584); float32x4_t v736 = vsubq_f32(v572, v584); - *(float32x4_t *)v1642 = v680; - *(float32x4_t *)v1651 = v688; - *(float32x4_t *)v1660 = v696; - *(float32x4_t *)v1669 = v704; - *(float32x4_t *)v1552 = v600; - *(float32x4_t *)v1561 = v608; - *(float32x4_t *)v1696 = v728; - *(float32x4_t *)v1705 = v736; + vst1q_f32((float32_t *)v1642, v680); + vst1q_f32((float32_t *)v1651, v688); + vst1q_f32((float32_t *)v1660, v696); + vst1q_f32((float32_t *)v1669, v704); + vst1q_f32((float32_t *)v1552, v600); + vst1q_f32((float32_t *)v1561, v608); + vst1q_f32((float32_t *)v1696, v728); + vst1q_f32((float32_t *)v1705, v736); v5 += 2 * 1; v6 += 2 * 1; } @@ -9199,7 +9199,7 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu20(const armral_cmplx_int16_t *restrict x, float32x2_t v423 = (float32x2_t){v422, v422}; const int32_t *v1064 = &v5[0]; float32x2_t *v1245 = &v6[0]; - int16x4_t v1456 = *(const int16x4_t *)v1226; + int16x4_t v1456 = vld1_s16((const int16_t *)v1226); float32x4_t v198 = vcvtq_n_f32_s32(vmovl_s16(v1456), 15); float32x4_t v339 = 
vcombine_f32(v338, v338); float32x4_t v344 = vcombine_f32(v343, v343); @@ -9248,7 +9248,7 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu20(const armral_cmplx_int16_t *restrict x, float32x2_t *v1398 = &v6[ostride * 9]; float32x2_t *v1407 = &v6[ostride * 14]; float32x2_t *v1416 = &v6[ostride * 19]; - int16x4_t v1420 = *(const int16x4_t *)v1064; + int16x4_t v1420 = vld1_s16((const int16_t *)v1064); float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v1420), 15); float32x4_t v352 = vcombine_f32(v350, v350); float32x4_t v360 = vcombine_f32(v358, v358); @@ -9256,24 +9256,24 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu20(const armral_cmplx_int16_t *restrict x, float32x4_t v393 = vcombine_f32(v391, v391); float32x4_t v401 = vcombine_f32(v399, v399); float32x4_t v409 = vcombine_f32(v407, v407); - int16x4_t v1422 = *(const int16x4_t *)v1073; - int16x4_t v1424 = *(const int16x4_t *)v1082; - int16x4_t v1426 = *(const int16x4_t *)v1091; - int16x4_t v1428 = *(const int16x4_t *)v1100; - int16x4_t v1430 = *(const int16x4_t *)v1109; - int16x4_t v1432 = *(const int16x4_t *)v1118; - int16x4_t v1434 = *(const int16x4_t *)v1127; - int16x4_t v1436 = *(const int16x4_t *)v1136; - int16x4_t v1438 = *(const int16x4_t *)v1145; - int16x4_t v1440 = *(const int16x4_t *)v1154; - int16x4_t v1442 = *(const int16x4_t *)v1163; - int16x4_t v1444 = *(const int16x4_t *)v1172; - int16x4_t v1446 = *(const int16x4_t *)v1181; - int16x4_t v1448 = *(const int16x4_t *)v1190; - int16x4_t v1450 = *(const int16x4_t *)v1199; - int16x4_t v1452 = *(const int16x4_t *)v1208; - int16x4_t v1454 = *(const int16x4_t *)v1217; - int16x4_t v1458 = *(const int16x4_t *)v1235; + int16x4_t v1422 = vld1_s16((const int16_t *)v1073); + int16x4_t v1424 = vld1_s16((const int16_t *)v1082); + int16x4_t v1426 = vld1_s16((const int16_t *)v1091); + int16x4_t v1428 = vld1_s16((const int16_t *)v1100); + int16x4_t v1430 = vld1_s16((const int16_t *)v1109); + int16x4_t v1432 = vld1_s16((const int16_t *)v1118); + int16x4_t v1434 = vld1_s16((const int16_t 
*)v1127); + int16x4_t v1436 = vld1_s16((const int16_t *)v1136); + int16x4_t v1438 = vld1_s16((const int16_t *)v1145); + int16x4_t v1440 = vld1_s16((const int16_t *)v1154); + int16x4_t v1442 = vld1_s16((const int16_t *)v1163); + int16x4_t v1444 = vld1_s16((const int16_t *)v1172); + int16x4_t v1446 = vld1_s16((const int16_t *)v1181); + int16x4_t v1448 = vld1_s16((const int16_t *)v1190); + int16x4_t v1450 = vld1_s16((const int16_t *)v1199); + int16x4_t v1452 = vld1_s16((const int16_t *)v1208); + int16x4_t v1454 = vld1_s16((const int16_t *)v1217); + int16x4_t v1458 = vld1_s16((const int16_t *)v1235); float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v1422), 15); float32x4_t v46 = vcvtq_n_f32_s32(vmovl_s16(v1424), 15); float32x4_t v54 = vcvtq_n_f32_s32(vmovl_s16(v1426), 15); @@ -9396,8 +9396,8 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu20(const armral_cmplx_int16_t *restrict x, float32x4_t v373 = vsubq_f32(v353, v361); float32x4_t v374 = vaddq_f32(v361, v369); float32x4_t v394 = vmulq_f32(v392, v393); - *(float32x4_t *)v1245 = v218; - *(float32x4_t *)v1263 = v274; + vst1q_f32((float32_t *)v1245, v218); + vst1q_f32((float32_t *)v1263, v274); float32x4_t v259 = vaddq_f32(v258, v233); float32x4_t v260 = vsubq_f32(v258, v233); float32x4_t v261 = vsubq_f32(v241, v249); @@ -9423,20 +9423,20 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu20(const armral_cmplx_int16_t *restrict x, float32x4_t v322 = vsubq_f32(v316, v318); float32x4_t v427 = vaddq_f32(v426, v410); float32x4_t v428 = vsubq_f32(v426, v410); - *(float32x4_t *)v1254 = v436; - *(float32x4_t *)v1272 = v435; + vst1q_f32((float32_t *)v1254, v436); + vst1q_f32((float32_t *)v1272, v435); float32x4_t v431 = vaddq_f32(v427, v429); float32x4_t v432 = vsubq_f32(v427, v429); float32x4_t v433 = vaddq_f32(v428, v430); float32x4_t v434 = vsubq_f32(v428, v430); - *(float32x4_t *)v1281 = v264; - *(float32x4_t *)v1299 = v320; - *(float32x4_t *)v1317 = v266; - *(float32x4_t *)v1335 = v322; - *(float32x4_t *)v1353 = v265; - *(float32x4_t *)v1371 = 
v321; - *(float32x4_t *)v1389 = v263; - *(float32x4_t *)v1407 = v319; + vst1q_f32((float32_t *)v1281, v264); + vst1q_f32((float32_t *)v1299, v320); + vst1q_f32((float32_t *)v1317, v266); + vst1q_f32((float32_t *)v1335, v322); + vst1q_f32((float32_t *)v1353, v265); + vst1q_f32((float32_t *)v1371, v321); + vst1q_f32((float32_t *)v1389, v263); + vst1q_f32((float32_t *)v1407, v319); float32x4_t v465 = vaddq_f32(v376, v432); float32x4_t v466 = vsubq_f32(v376, v432); float32x4_t v495 = vaddq_f32(v378, v434); @@ -9445,14 +9445,14 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu20(const armral_cmplx_int16_t *restrict x, float32x4_t v526 = vsubq_f32(v377, v433); float32x4_t v555 = vaddq_f32(v375, v431); float32x4_t v556 = vsubq_f32(v375, v431); - *(float32x4_t *)v1290 = v466; - *(float32x4_t *)v1308 = v465; - *(float32x4_t *)v1326 = v496; - *(float32x4_t *)v1344 = v495; - *(float32x4_t *)v1362 = v526; - *(float32x4_t *)v1380 = v525; - *(float32x4_t *)v1398 = v556; - *(float32x4_t *)v1416 = v555; + vst1q_f32((float32_t *)v1290, v466); + vst1q_f32((float32_t *)v1308, v465); + vst1q_f32((float32_t *)v1326, v496); + vst1q_f32((float32_t *)v1344, v495); + vst1q_f32((float32_t *)v1362, v526); + vst1q_f32((float32_t *)v1380, v525); + vst1q_f32((float32_t *)v1398, v556); + vst1q_f32((float32_t *)v1416, v555); v5 += 2 * 1; v6 += 2 * 1; } @@ -10247,7 +10247,7 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu21(const armral_cmplx_int16_t *restrict x, float32x2_t v470 = (float32x2_t){v469, v469}; const int32_t *v1225 = &v5[0]; float32x2_t *v1397 = &v6[0]; - int16x4_t v1611 = *(const int16x4_t *)v1342; + int16x4_t v1611 = vld1_s16((const int16_t *)v1342); float32x4_t v163 = vcvtq_n_f32_s32(vmovl_s16(v1611), 15); float32x4_t v235 = vcombine_f32(v234, v234); float32x4_t v240 = vcombine_f32(v239, v239); @@ -10313,7 +10313,7 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu21(const armral_cmplx_int16_t *restrict x, float32x2_t *v1559 = &v6[ostride * 6]; float32x2_t *v1568 = &v6[ostride * 13]; float32x2_t *v1577 = 
&v6[ostride * 20]; - int16x4_t v1585 = *(const int16x4_t *)v1225; + int16x4_t v1585 = vld1_s16((const int16_t *)v1225); float32x4_t v46 = vcvtq_n_f32_s32(vmovl_s16(v1585), 15); float32x4_t v258 = vcombine_f32(v256, v256); float32x4_t v266 = vcombine_f32(v264, v264); @@ -10328,25 +10328,25 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu21(const armral_cmplx_int16_t *restrict x, float32x4_t v435 = vcombine_f32(v433, v433); float32x4_t v443 = vcombine_f32(v441, v441); float32x4_t v451 = vcombine_f32(v449, v449); - int16x4_t v1581 = *(const int16x4_t *)v1206; - int16x4_t v1583 = *(const int16x4_t *)v1215; - int16x4_t v1587 = *(const int16x4_t *)v1234; - int16x4_t v1589 = *(const int16x4_t *)v1243; - int16x4_t v1591 = *(const int16x4_t *)v1252; - int16x4_t v1593 = *(const int16x4_t *)v1261; - int16x4_t v1595 = *(const int16x4_t *)v1270; - int16x4_t v1597 = *(const int16x4_t *)v1279; - int16x4_t v1599 = *(const int16x4_t *)v1288; - int16x4_t v1601 = *(const int16x4_t *)v1297; - int16x4_t v1603 = *(const int16x4_t *)v1306; - int16x4_t v1605 = *(const int16x4_t *)v1315; - int16x4_t v1607 = *(const int16x4_t *)v1324; - int16x4_t v1609 = *(const int16x4_t *)v1333; - int16x4_t v1613 = *(const int16x4_t *)v1351; - int16x4_t v1615 = *(const int16x4_t *)v1360; - int16x4_t v1617 = *(const int16x4_t *)v1369; - int16x4_t v1619 = *(const int16x4_t *)v1378; - int16x4_t v1621 = *(const int16x4_t *)v1387; + int16x4_t v1581 = vld1_s16((const int16_t *)v1206); + int16x4_t v1583 = vld1_s16((const int16_t *)v1215); + int16x4_t v1587 = vld1_s16((const int16_t *)v1234); + int16x4_t v1589 = vld1_s16((const int16_t *)v1243); + int16x4_t v1591 = vld1_s16((const int16_t *)v1252); + int16x4_t v1593 = vld1_s16((const int16_t *)v1261); + int16x4_t v1595 = vld1_s16((const int16_t *)v1270); + int16x4_t v1597 = vld1_s16((const int16_t *)v1279); + int16x4_t v1599 = vld1_s16((const int16_t *)v1288); + int16x4_t v1601 = vld1_s16((const int16_t *)v1297); + int16x4_t v1603 = vld1_s16((const int16_t *)v1306); + 
int16x4_t v1605 = vld1_s16((const int16_t *)v1315); + int16x4_t v1607 = vld1_s16((const int16_t *)v1324); + int16x4_t v1609 = vld1_s16((const int16_t *)v1333); + int16x4_t v1613 = vld1_s16((const int16_t *)v1351); + int16x4_t v1615 = vld1_s16((const int16_t *)v1360); + int16x4_t v1617 = vld1_s16((const int16_t *)v1369); + int16x4_t v1619 = vld1_s16((const int16_t *)v1378); + int16x4_t v1621 = vld1_s16((const int16_t *)v1387); float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v1581), 15); float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v1583), 15); float32x4_t v55 = vcvtq_n_f32_s32(vmovl_s16(v1587), 15); @@ -10489,7 +10489,7 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu21(const armral_cmplx_int16_t *restrict x, float32x4_t v483 = vsubq_f32(v482, v472); float32x4_t v485 = vaddq_f32(v484, v472); float32x4_t v492 = vaddq_f32(v218, v324); - *(float32x4_t *)v1397 = v218; + vst1q_f32((float32_t *)v1397, v218); float32x4_t v285 = vaddq_f32(v284, v241); float32x4_t v287 = vsubq_f32(v284, v241); float32x4_t v289 = vsubq_f32(v284, v246); @@ -10517,8 +10517,8 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu21(const armral_cmplx_int16_t *restrict x, float32x4_t v474 = vaddq_f32(v473, v436); float32x4_t v476 = vsubq_f32(v473, v436); float32x4_t v478 = vsubq_f32(v473, v444); - *(float32x4_t *)v1406 = v494; - *(float32x4_t *)v1415 = v493; + vst1q_f32((float32_t *)v1406, v494); + vst1q_f32((float32_t *)v1415, v493); float32x4_t v297 = vaddq_f32(v286, v292); float32x4_t v298 = vsubq_f32(v286, v292); float32x4_t v299 = vaddq_f32(v288, v294); @@ -10546,12 +10546,12 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu21(const armral_cmplx_int16_t *restrict x, float32x4_t v588 = vaddq_f32(v302, v395); float32x4_t v612 = vaddq_f32(v299, v392); float32x4_t v636 = vaddq_f32(v297, v390); - *(float32x4_t *)v1424 = v298; - *(float32x4_t *)v1451 = v300; - *(float32x4_t *)v1478 = v301; - *(float32x4_t *)v1505 = v302; - *(float32x4_t *)v1532 = v299; - *(float32x4_t *)v1559 = v297; + vst1q_f32((float32_t *)v1424, v298); + 
vst1q_f32((float32_t *)v1451, v300); + vst1q_f32((float32_t *)v1478, v301); + vst1q_f32((float32_t *)v1505, v302); + vst1q_f32((float32_t *)v1532, v299); + vst1q_f32((float32_t *)v1559, v297); float32x4_t v517 = vaddq_f32(v516, v487); float32x4_t v518 = vsubq_f32(v516, v487); float32x4_t v541 = vaddq_f32(v540, v489); @@ -10564,18 +10564,18 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu21(const armral_cmplx_int16_t *restrict x, float32x4_t v614 = vsubq_f32(v612, v488); float32x4_t v637 = vaddq_f32(v636, v486); float32x4_t v638 = vsubq_f32(v636, v486); - *(float32x4_t *)v1433 = v518; - *(float32x4_t *)v1442 = v517; - *(float32x4_t *)v1460 = v542; - *(float32x4_t *)v1469 = v541; - *(float32x4_t *)v1487 = v566; - *(float32x4_t *)v1496 = v565; - *(float32x4_t *)v1514 = v590; - *(float32x4_t *)v1523 = v589; - *(float32x4_t *)v1541 = v614; - *(float32x4_t *)v1550 = v613; - *(float32x4_t *)v1568 = v638; - *(float32x4_t *)v1577 = v637; + vst1q_f32((float32_t *)v1433, v518); + vst1q_f32((float32_t *)v1442, v517); + vst1q_f32((float32_t *)v1460, v542); + vst1q_f32((float32_t *)v1469, v541); + vst1q_f32((float32_t *)v1487, v566); + vst1q_f32((float32_t *)v1496, v565); + vst1q_f32((float32_t *)v1514, v590); + vst1q_f32((float32_t *)v1523, v589); + vst1q_f32((float32_t *)v1541, v614); + vst1q_f32((float32_t *)v1550, v613); + vst1q_f32((float32_t *)v1568, v638); + vst1q_f32((float32_t *)v1577, v637); v5 += 2 * 1; v6 += 2 * 1; } @@ -11596,7 +11596,7 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu22(const armral_cmplx_int16_t *restrict x, float32x2_t v604 = (float32x2_t){v602, v603}; const int32_t *v1490 = &v5[0]; float32x2_t *v1689 = &v6[0]; - int16x4_t v1908 = *(const int16x4_t *)v1607; + int16x4_t v1908 = vld1_s16((const int16_t *)v1607); float32x4_t v144 = vcvtq_n_f32_s32(vmovl_s16(v1908), 15); float32x4_t v483 = vcombine_f32(v482, v482); float32x2_t v489 = vmul_f32(v605, v487); @@ -11658,7 +11658,7 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu22(const armral_cmplx_int16_t *restrict x, 
float32x2_t *v1860 = &v6[ostride * 9]; float32x2_t *v1869 = &v6[ostride * 10]; float32x2_t *v1878 = &v6[ostride * 21]; - int16x4_t v1882 = *(const int16x4_t *)v1490; + int16x4_t v1882 = vld1_s16((const int16_t *)v1490); float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v1882), 15); float32x4_t v491 = vcombine_f32(v489, v489); float32x4_t v544 = vcombine_f32(v542, v542); @@ -11670,26 +11670,26 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu22(const armral_cmplx_int16_t *restrict x, float32x4_t v592 = vcombine_f32(v590, v590); float32x4_t v600 = vcombine_f32(v598, v598); float32x4_t v608 = vcombine_f32(v606, v606); - int16x4_t v1884 = *(const int16x4_t *)v1499; - int16x4_t v1886 = *(const int16x4_t *)v1508; - int16x4_t v1888 = *(const int16x4_t *)v1517; - int16x4_t v1890 = *(const int16x4_t *)v1526; - int16x4_t v1892 = *(const int16x4_t *)v1535; - int16x4_t v1894 = *(const int16x4_t *)v1544; - int16x4_t v1896 = *(const int16x4_t *)v1553; - int16x4_t v1898 = *(const int16x4_t *)v1562; - int16x4_t v1900 = *(const int16x4_t *)v1571; - int16x4_t v1902 = *(const int16x4_t *)v1580; - int16x4_t v1904 = *(const int16x4_t *)v1589; - int16x4_t v1906 = *(const int16x4_t *)v1598; - int16x4_t v1910 = *(const int16x4_t *)v1616; - int16x4_t v1912 = *(const int16x4_t *)v1625; - int16x4_t v1914 = *(const int16x4_t *)v1634; - int16x4_t v1916 = *(const int16x4_t *)v1643; - int16x4_t v1918 = *(const int16x4_t *)v1652; - int16x4_t v1920 = *(const int16x4_t *)v1661; - int16x4_t v1922 = *(const int16x4_t *)v1670; - int16x4_t v1924 = *(const int16x4_t *)v1679; + int16x4_t v1884 = vld1_s16((const int16_t *)v1499); + int16x4_t v1886 = vld1_s16((const int16_t *)v1508); + int16x4_t v1888 = vld1_s16((const int16_t *)v1517); + int16x4_t v1890 = vld1_s16((const int16_t *)v1526); + int16x4_t v1892 = vld1_s16((const int16_t *)v1535); + int16x4_t v1894 = vld1_s16((const int16_t *)v1544); + int16x4_t v1896 = vld1_s16((const int16_t *)v1553); + int16x4_t v1898 = vld1_s16((const int16_t *)v1562); + int16x4_t v1900 
= vld1_s16((const int16_t *)v1571); + int16x4_t v1902 = vld1_s16((const int16_t *)v1580); + int16x4_t v1904 = vld1_s16((const int16_t *)v1589); + int16x4_t v1906 = vld1_s16((const int16_t *)v1598); + int16x4_t v1910 = vld1_s16((const int16_t *)v1616); + int16x4_t v1912 = vld1_s16((const int16_t *)v1625); + int16x4_t v1914 = vld1_s16((const int16_t *)v1634); + int16x4_t v1916 = vld1_s16((const int16_t *)v1643); + int16x4_t v1918 = vld1_s16((const int16_t *)v1652); + int16x4_t v1920 = vld1_s16((const int16_t *)v1661); + int16x4_t v1922 = vld1_s16((const int16_t *)v1670); + int16x4_t v1924 = vld1_s16((const int16_t *)v1679); float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v1884), 15); float32x4_t v46 = vcvtq_n_f32_s32(vmovl_s16(v1886), 15); float32x4_t v54 = vcvtq_n_f32_s32(vmovl_s16(v1888), 15); @@ -11900,8 +11900,8 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu22(const armral_cmplx_int16_t *restrict x, float32x4_t v624 = vsubq_f32(v593, v609); float32x4_t v625 = vaddq_f32(v615, v617); float32x4_t v643 = vaddq_f32(v619, v620); - *(float32x4_t *)v1689 = v235; - *(float32x4_t *)v1698 = v454; + vst1q_f32((float32_t *)v1689, v235); + vst1q_f32((float32_t *)v1698, v454); float32x4_t v407 = vaddq_f32(v406, v391); float32x4_t v408 = vsubq_f32(v391, v393); float32x4_t v410 = vaddq_f32(v391, v397); @@ -11962,26 +11962,26 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu22(const armral_cmplx_int16_t *restrict x, float32x4_t v437 = vsubq_f32(v415, v427); float32x4_t v647 = vaddq_f32(v634, v646); float32x4_t v656 = vsubq_f32(v634, v646); - *(float32x4_t *)v1725 = v436; - *(float32x4_t *)v1734 = v655; - *(float32x4_t *)v1743 = v435; - *(float32x4_t *)v1752 = v654; - *(float32x4_t *)v1761 = v434; - *(float32x4_t *)v1770 = v653; - *(float32x4_t *)v1779 = v433; - *(float32x4_t *)v1788 = v652; - *(float32x4_t *)v1797 = v432; - *(float32x4_t *)v1806 = v651; - *(float32x4_t *)v1815 = v431; - *(float32x4_t *)v1824 = v650; - *(float32x4_t *)v1833 = v430; - *(float32x4_t *)v1842 = v649; - *(float32x4_t 
*)v1851 = v429; - *(float32x4_t *)v1860 = v648; - *(float32x4_t *)v1707 = v437; - *(float32x4_t *)v1716 = v656; - *(float32x4_t *)v1869 = v428; - *(float32x4_t *)v1878 = v647; + vst1q_f32((float32_t *)v1725, v436); + vst1q_f32((float32_t *)v1734, v655); + vst1q_f32((float32_t *)v1743, v435); + vst1q_f32((float32_t *)v1752, v654); + vst1q_f32((float32_t *)v1761, v434); + vst1q_f32((float32_t *)v1770, v653); + vst1q_f32((float32_t *)v1779, v433); + vst1q_f32((float32_t *)v1788, v652); + vst1q_f32((float32_t *)v1797, v432); + vst1q_f32((float32_t *)v1806, v651); + vst1q_f32((float32_t *)v1815, v431); + vst1q_f32((float32_t *)v1824, v650); + vst1q_f32((float32_t *)v1833, v430); + vst1q_f32((float32_t *)v1842, v649); + vst1q_f32((float32_t *)v1851, v429); + vst1q_f32((float32_t *)v1860, v648); + vst1q_f32((float32_t *)v1707, v437); + vst1q_f32((float32_t *)v1716, v656); + vst1q_f32((float32_t *)v1869, v428); + vst1q_f32((float32_t *)v1878, v647); v5 += 2 * 1; v6 += 2 * 1; } @@ -13114,7 +13114,7 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu24(const armral_cmplx_int16_t *restrict x, float32x2_t v452 = (float32x2_t){v450, v451}; const int32_t *v1216 = &v5[0]; float32x2_t *v1415 = &v6[0]; - int16x4_t v1646 = *(const int16x4_t *)v1288; + int16x4_t v1646 = vld1_s16((const int16_t *)v1288); float32x4_t v117 = vcvtq_n_f32_s32(vmovl_s16(v1646), 15); float32x2_t v285 = vmul_f32(v453, v283); float32x2_t v293 = vmul_f32(v453, v291); @@ -13171,7 +13171,7 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu24(const armral_cmplx_int16_t *restrict x, float32x2_t *v1604 = &v6[ostride * 15]; float32x2_t *v1613 = &v6[ostride * 7]; float32x2_t *v1622 = &v6[ostride * 23]; - int16x4_t v1630 = *(const int16x4_t *)v1216; + int16x4_t v1630 = vld1_s16((const int16_t *)v1216); float32x4_t v46 = vcvtq_n_f32_s32(vmovl_s16(v1630), 15); float32x4_t v287 = vcombine_f32(v285, v285); float32x4_t v295 = vcombine_f32(v293, v293); @@ -13179,28 +13179,28 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu24(const 
armral_cmplx_int16_t *restrict x, float32x4_t v370 = vcombine_f32(v368, v368); float32x4_t v438 = vcombine_f32(v436, v436); float32x4_t v456 = vcombine_f32(v454, v454); - int16x4_t v1626 = *(const int16x4_t *)v1197; - int16x4_t v1628 = *(const int16x4_t *)v1206; - int16x4_t v1632 = *(const int16x4_t *)v1225; - int16x4_t v1634 = *(const int16x4_t *)v1234; - int16x4_t v1636 = *(const int16x4_t *)v1243; - int16x4_t v1638 = *(const int16x4_t *)v1252; - int16x4_t v1640 = *(const int16x4_t *)v1261; - int16x4_t v1642 = *(const int16x4_t *)v1270; - int16x4_t v1644 = *(const int16x4_t *)v1279; - int16x4_t v1648 = *(const int16x4_t *)v1297; - int16x4_t v1650 = *(const int16x4_t *)v1306; - int16x4_t v1652 = *(const int16x4_t *)v1315; - int16x4_t v1654 = *(const int16x4_t *)v1324; - int16x4_t v1656 = *(const int16x4_t *)v1333; - int16x4_t v1658 = *(const int16x4_t *)v1342; - int16x4_t v1660 = *(const int16x4_t *)v1351; - int16x4_t v1662 = *(const int16x4_t *)v1360; - int16x4_t v1664 = *(const int16x4_t *)v1369; - int16x4_t v1666 = *(const int16x4_t *)v1378; - int16x4_t v1668 = *(const int16x4_t *)v1387; - int16x4_t v1670 = *(const int16x4_t *)v1396; - int16x4_t v1672 = *(const int16x4_t *)v1405; + int16x4_t v1626 = vld1_s16((const int16_t *)v1197); + int16x4_t v1628 = vld1_s16((const int16_t *)v1206); + int16x4_t v1632 = vld1_s16((const int16_t *)v1225); + int16x4_t v1634 = vld1_s16((const int16_t *)v1234); + int16x4_t v1636 = vld1_s16((const int16_t *)v1243); + int16x4_t v1638 = vld1_s16((const int16_t *)v1252); + int16x4_t v1640 = vld1_s16((const int16_t *)v1261); + int16x4_t v1642 = vld1_s16((const int16_t *)v1270); + int16x4_t v1644 = vld1_s16((const int16_t *)v1279); + int16x4_t v1648 = vld1_s16((const int16_t *)v1297); + int16x4_t v1650 = vld1_s16((const int16_t *)v1306); + int16x4_t v1652 = vld1_s16((const int16_t *)v1315); + int16x4_t v1654 = vld1_s16((const int16_t *)v1324); + int16x4_t v1656 = vld1_s16((const int16_t *)v1333); + int16x4_t v1658 = vld1_s16((const 
int16_t *)v1342); + int16x4_t v1660 = vld1_s16((const int16_t *)v1351); + int16x4_t v1662 = vld1_s16((const int16_t *)v1360); + int16x4_t v1664 = vld1_s16((const int16_t *)v1369); + int16x4_t v1666 = vld1_s16((const int16_t *)v1378); + int16x4_t v1668 = vld1_s16((const int16_t *)v1387); + int16x4_t v1670 = vld1_s16((const int16_t *)v1396); + int16x4_t v1672 = vld1_s16((const int16_t *)v1405); float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v1626), 15); float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v1628), 15); float32x4_t v55 = vcvtq_n_f32_s32(vmovl_s16(v1632), 15); @@ -13342,8 +13342,8 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu24(const armral_cmplx_int16_t *restrict x, float32x4_t v461 = vsubq_f32(v439, v457); float32x4_t v468 = vaddq_f32(v249, v332); float32x4_t v564 = vaddq_f32(v250, v337); - *(float32x4_t *)v1415 = v249; - *(float32x4_t *)v1523 = v250; + vst1q_f32((float32_t *)v1415, v249); + vst1q_f32((float32_t *)v1523, v250); float32x4_t v302 = vaddq_f32(v246, v275); float32x4_t v303 = vsubq_f32(v246, v275); float32x4_t v306 = vaddq_f32(v288, v296); @@ -13366,12 +13366,12 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu24(const armral_cmplx_int16_t *restrict x, float32x4_t v311 = vsubq_f32(v305, v307); float32x4_t v516 = vaddq_f32(v303, v378); float32x4_t v612 = vaddq_f32(v302, v377); - *(float32x4_t *)v1424 = v470; - *(float32x4_t *)v1433 = v469; - *(float32x4_t *)v1469 = v303; - *(float32x4_t *)v1532 = v566; - *(float32x4_t *)v1541 = v565; - *(float32x4_t *)v1577 = v302; + vst1q_f32((float32_t *)v1424, v470); + vst1q_f32((float32_t *)v1433, v469); + vst1q_f32((float32_t *)v1469, v303); + vst1q_f32((float32_t *)v1532, v566); + vst1q_f32((float32_t *)v1541, v565); + vst1q_f32((float32_t *)v1577, v302); float32x4_t v492 = vaddq_f32(v309, v384); float32x4_t v517 = vaddq_f32(v516, v459); float32x4_t v518 = vsubq_f32(v516, v459); @@ -13380,10 +13380,10 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu24(const armral_cmplx_int16_t *restrict x, float32x4_t v613 = vaddq_f32(v612, v458); 
float32x4_t v614 = vsubq_f32(v612, v458); float32x4_t v636 = vaddq_f32(v308, v383); - *(float32x4_t *)v1442 = v309; - *(float32x4_t *)v1496 = v310; - *(float32x4_t *)v1550 = v311; - *(float32x4_t *)v1604 = v308; + vst1q_f32((float32_t *)v1442, v309); + vst1q_f32((float32_t *)v1496, v310); + vst1q_f32((float32_t *)v1550, v311); + vst1q_f32((float32_t *)v1604, v308); float32x4_t v493 = vaddq_f32(v492, v465); float32x4_t v494 = vsubq_f32(v492, v465); float32x4_t v541 = vaddq_f32(v540, v466); @@ -13392,18 +13392,18 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu24(const armral_cmplx_int16_t *restrict x, float32x4_t v590 = vsubq_f32(v588, v467); float32x4_t v637 = vaddq_f32(v636, v464); float32x4_t v638 = vsubq_f32(v636, v464); - *(float32x4_t *)v1478 = v518; - *(float32x4_t *)v1487 = v517; - *(float32x4_t *)v1586 = v614; - *(float32x4_t *)v1595 = v613; - *(float32x4_t *)v1451 = v494; - *(float32x4_t *)v1460 = v493; - *(float32x4_t *)v1505 = v542; - *(float32x4_t *)v1514 = v541; - *(float32x4_t *)v1559 = v590; - *(float32x4_t *)v1568 = v589; - *(float32x4_t *)v1613 = v638; - *(float32x4_t *)v1622 = v637; + vst1q_f32((float32_t *)v1478, v518); + vst1q_f32((float32_t *)v1487, v517); + vst1q_f32((float32_t *)v1586, v614); + vst1q_f32((float32_t *)v1595, v613); + vst1q_f32((float32_t *)v1451, v494); + vst1q_f32((float32_t *)v1460, v493); + vst1q_f32((float32_t *)v1505, v542); + vst1q_f32((float32_t *)v1514, v541); + vst1q_f32((float32_t *)v1559, v590); + vst1q_f32((float32_t *)v1568, v589); + vst1q_f32((float32_t *)v1613, v638); + vst1q_f32((float32_t *)v1622, v637); v5 += 2 * 1; v6 += 2 * 1; } @@ -14304,7 +14304,7 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu25(const armral_cmplx_int16_t *restrict x, float32x2_t v1715 = (float32x2_t){v1714, v1714}; const int32_t *v3160 = &v5[0]; float32x2_t *v3386 = &v6[0]; - int16x4_t v3616 = *(const int16x4_t *)v3205; + int16x4_t v3616 = vld1_s16((const int16_t *)v3205); float32x4_t v201 = vcvtq_n_f32_s32(vmovl_s16(v3616), 15); float32x2_t v942 = 
(float32x2_t){v941, v944}; float32x4_t v1057 = vcombine_f32(v1056, v1056); @@ -14375,7 +14375,7 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu25(const armral_cmplx_int16_t *restrict x, float32x2_t *v3584 = &v6[ostride * 14]; float32x2_t *v3593 = &v6[ostride * 19]; float32x2_t *v3602 = &v6[ostride * 24]; - int16x4_t v3606 = *(const int16x4_t *)v3160; + int16x4_t v3606 = vld1_s16((const int16_t *)v3160); float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v3606), 15); float32x4_t v946 = vcombine_f32(v942, v942); float32x4_t v1065 = vcombine_f32(v1063, v1063); @@ -14388,29 +14388,29 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu25(const armral_cmplx_int16_t *restrict x, float32x4_t v1597 = vcombine_f32(v1595, v1595); float32x4_t v1618 = vcombine_f32(v1616, v1616); float32x4_t v1690 = vcombine_f32(v1688, v1688); - int16x4_t v3608 = *(const int16x4_t *)v3169; - int16x4_t v3610 = *(const int16x4_t *)v3178; - int16x4_t v3612 = *(const int16x4_t *)v3187; - int16x4_t v3614 = *(const int16x4_t *)v3196; - int16x4_t v3618 = *(const int16x4_t *)v3214; - int16x4_t v3620 = *(const int16x4_t *)v3223; - int16x4_t v3622 = *(const int16x4_t *)v3232; - int16x4_t v3624 = *(const int16x4_t *)v3241; - int16x4_t v3626 = *(const int16x4_t *)v3250; - int16x4_t v3628 = *(const int16x4_t *)v3259; - int16x4_t v3630 = *(const int16x4_t *)v3268; - int16x4_t v3632 = *(const int16x4_t *)v3277; - int16x4_t v3634 = *(const int16x4_t *)v3286; - int16x4_t v3636 = *(const int16x4_t *)v3295; - int16x4_t v3638 = *(const int16x4_t *)v3304; - int16x4_t v3640 = *(const int16x4_t *)v3313; - int16x4_t v3642 = *(const int16x4_t *)v3322; - int16x4_t v3644 = *(const int16x4_t *)v3331; - int16x4_t v3646 = *(const int16x4_t *)v3340; - int16x4_t v3648 = *(const int16x4_t *)v3349; - int16x4_t v3650 = *(const int16x4_t *)v3358; - int16x4_t v3652 = *(const int16x4_t *)v3367; - int16x4_t v3654 = *(const int16x4_t *)v3376; + int16x4_t v3608 = vld1_s16((const int16_t *)v3169); + int16x4_t v3610 = vld1_s16((const int16_t *)v3178); + 
int16x4_t v3612 = vld1_s16((const int16_t *)v3187); + int16x4_t v3614 = vld1_s16((const int16_t *)v3196); + int16x4_t v3618 = vld1_s16((const int16_t *)v3214); + int16x4_t v3620 = vld1_s16((const int16_t *)v3223); + int16x4_t v3622 = vld1_s16((const int16_t *)v3232); + int16x4_t v3624 = vld1_s16((const int16_t *)v3241); + int16x4_t v3626 = vld1_s16((const int16_t *)v3250); + int16x4_t v3628 = vld1_s16((const int16_t *)v3259); + int16x4_t v3630 = vld1_s16((const int16_t *)v3268); + int16x4_t v3632 = vld1_s16((const int16_t *)v3277); + int16x4_t v3634 = vld1_s16((const int16_t *)v3286); + int16x4_t v3636 = vld1_s16((const int16_t *)v3295); + int16x4_t v3638 = vld1_s16((const int16_t *)v3304); + int16x4_t v3640 = vld1_s16((const int16_t *)v3313); + int16x4_t v3642 = vld1_s16((const int16_t *)v3322); + int16x4_t v3644 = vld1_s16((const int16_t *)v3331); + int16x4_t v3646 = vld1_s16((const int16_t *)v3340); + int16x4_t v3648 = vld1_s16((const int16_t *)v3349); + int16x4_t v3650 = vld1_s16((const int16_t *)v3358); + int16x4_t v3652 = vld1_s16((const int16_t *)v3367); + int16x4_t v3654 = vld1_s16((const int16_t *)v3376); float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v3608), 15); float32x4_t v44 = vcvtq_n_f32_s32(vmovl_s16(v3610), 15); float32x4_t v52 = vcvtq_n_f32_s32(vmovl_s16(v3612), 15); @@ -14735,7 +14735,7 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu25(const armral_cmplx_int16_t *restrict x, float32x4_t v1585 = vfmaq_f32(v1584, v539, v1575); float32x4_t v1599 = vfmaq_f32(v1598, v885, v1589); float32x4_t v1620 = vfmaq_f32(v1619, v712, v1610); - *(float32x4_t *)v3386 = v988; + vst1q_f32((float32_t *)v3386, v988); float32x4_t v975 = vsubq_f32(v963, v974); float32x4_t v980 = vmulq_f32(v963, v1716); float32x4_t v1102 = vsubq_f32(v1101, v1096); @@ -14794,8 +14794,8 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu25(const armral_cmplx_int16_t *restrict x, float32x4_t v1629 = vsubq_f32(v1606, v1627); float32x4_t v1641 = vaddq_f32(v1600, v1640); float32x4_t v1659 = vsubq_f32(v1658, 
v1621); - *(float32x4_t *)v3404 = v1020; - *(float32x4_t *)v3476 = v1324; + vst1q_f32((float32_t *)v3404, v1020); + vst1q_f32((float32_t *)v3476, v1324); float32x4_t v1046 = vsubq_f32(v1045, v1004); float32x4_t v1131 = vsubq_f32(v172, v1130); float32x4_t v1171 = vmulq_f32(v1169, v1690); @@ -14810,10 +14810,10 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu25(const armral_cmplx_int16_t *restrict x, float32x4_t v1660 = vaddq_f32(v193, v1628); float32x4_t v1673 = vrev64q_f32(v1641); float32x4_t v1689 = vrev64q_f32(v1659); - *(float32x4_t *)v3395 = v1004; - *(float32x4_t *)v3413 = v1033; - *(float32x4_t *)v3431 = v1156; - *(float32x4_t *)v3521 = v1492; + vst1q_f32((float32_t *)v3395, v1004); + vst1q_f32((float32_t *)v3413, v1033); + vst1q_f32((float32_t *)v3431, v1156); + vst1q_f32((float32_t *)v3521, v1492); float32x4_t v1143 = vsubq_f32(v1131, v1142); float32x4_t v1148 = vmulq_f32(v1131, v1716); float32x4_t v1317 = vsubq_f32(v1316, v1311); @@ -14824,8 +14824,8 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu25(const armral_cmplx_int16_t *restrict x, float32x4_t v1635 = vsubq_f32(v193, v1634); float32x4_t v1675 = vmulq_f32(v1673, v1690); float32x4_t v1691 = vmulq_f32(v1689, v1690); - *(float32x4_t *)v3422 = v1046; - *(float32x4_t *)v3566 = v1660; + vst1q_f32((float32_t *)v3422, v1046); + vst1q_f32((float32_t *)v3566, v1660); float32x4_t v1149 = vsubq_f32(v1148, v1143); float32x4_t v1188 = vsubq_f32(v1143, v1187); float32x4_t v1200 = vmulq_f32(v1143, v1716); @@ -14837,7 +14837,7 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu25(const armral_cmplx_int16_t *restrict x, float32x4_t v1536 = vmulq_f32(v1479, v1716); float32x4_t v1647 = vsubq_f32(v1635, v1646); float32x4_t v1652 = vmulq_f32(v1635, v1716); - *(float32x4_t *)v3494 = v1356; + vst1q_f32((float32_t *)v3494, v1356); float32x4_t v1172 = vsubq_f32(v1149, v1171); float32x4_t v1201 = vsubq_f32(v1200, v1188); float32x4_t v1213 = vmulq_f32(v1149, v1716); @@ -14848,27 +14848,27 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu25(const 
armral_cmplx_int16_t *restrict x, float32x4_t v1653 = vsubq_f32(v1652, v1647); float32x4_t v1692 = vsubq_f32(v1647, v1691); float32x4_t v1704 = vmulq_f32(v1647, v1716); - *(float32x4_t *)v3449 = v1188; - *(float32x4_t *)v3485 = v1340; - *(float32x4_t *)v3503 = v1369; - *(float32x4_t *)v3539 = v1524; + vst1q_f32((float32_t *)v3449, v1188); + vst1q_f32((float32_t *)v3485, v1340); + vst1q_f32((float32_t *)v3503, v1369); + vst1q_f32((float32_t *)v3539, v1524); float32x4_t v1214 = vsubq_f32(v1213, v1172); float32x4_t v1550 = vsubq_f32(v1549, v1508); float32x4_t v1676 = vsubq_f32(v1653, v1675); float32x4_t v1705 = vsubq_f32(v1704, v1692); float32x4_t v1717 = vmulq_f32(v1653, v1716); - *(float32x4_t *)v3440 = v1172; - *(float32x4_t *)v3458 = v1201; - *(float32x4_t *)v3512 = v1382; - *(float32x4_t *)v3530 = v1508; - *(float32x4_t *)v3548 = v1537; - *(float32x4_t *)v3584 = v1692; + vst1q_f32((float32_t *)v3440, v1172); + vst1q_f32((float32_t *)v3458, v1201); + vst1q_f32((float32_t *)v3512, v1382); + vst1q_f32((float32_t *)v3530, v1508); + vst1q_f32((float32_t *)v3548, v1537); + vst1q_f32((float32_t *)v3584, v1692); float32x4_t v1718 = vsubq_f32(v1717, v1676); - *(float32x4_t *)v3467 = v1214; - *(float32x4_t *)v3557 = v1550; - *(float32x4_t *)v3575 = v1676; - *(float32x4_t *)v3593 = v1705; - *(float32x4_t *)v3602 = v1718; + vst1q_f32((float32_t *)v3467, v1214); + vst1q_f32((float32_t *)v3557, v1550); + vst1q_f32((float32_t *)v3575, v1676); + vst1q_f32((float32_t *)v3593, v1705); + vst1q_f32((float32_t *)v3602, v1718); v5 += 2 * 1; v6 += 2 * 1; } @@ -16171,7 +16171,7 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu32(const armral_cmplx_int16_t *restrict x, float32x2_t v1171 = (float32x2_t){v1169, v1170}; const int32_t *v2213 = &v5[0]; float32x2_t *v2502 = &v6[0]; - int16x4_t v2817 = *(const int16x4_t *)v2357; + int16x4_t v2817 = vld1_s16((const int16_t *)v2357); float32x4_t v404 = vcvtq_n_f32_s32(vmovl_s16(v2817), 15); float32x4_t v722 = vcombine_f32(v721, v721); float32x4_t v792 = 
vcombine_f32(v791, v791); @@ -16255,7 +16255,7 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu32(const armral_cmplx_int16_t *restrict x, float32x2_t *v2763 = &v6[ostride * 15]; float32x2_t *v2772 = &v6[ostride * 23]; float32x2_t *v2781 = &v6[ostride * 31]; - int16x4_t v2785 = *(const int16x4_t *)v2213; + int16x4_t v2785 = vld1_s16((const int16_t *)v2213); float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v2785), 15); float32x4_t v800 = vcombine_f32(v798, v798); float32x4_t v870 = vcombine_f32(v868, v868); @@ -16267,36 +16267,36 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu32(const armral_cmplx_int16_t *restrict x, float32x4_t v1150 = vcombine_f32(v1148, v1148); float32x4_t v1163 = vcombine_f32(v1161, v1161); float32x4_t v1175 = vcombine_f32(v1173, v1173); - int16x4_t v2787 = *(const int16x4_t *)v2222; - int16x4_t v2789 = *(const int16x4_t *)v2231; - int16x4_t v2791 = *(const int16x4_t *)v2240; - int16x4_t v2793 = *(const int16x4_t *)v2249; - int16x4_t v2795 = *(const int16x4_t *)v2258; - int16x4_t v2797 = *(const int16x4_t *)v2267; - int16x4_t v2799 = *(const int16x4_t *)v2276; - int16x4_t v2801 = *(const int16x4_t *)v2285; - int16x4_t v2803 = *(const int16x4_t *)v2294; - int16x4_t v2805 = *(const int16x4_t *)v2303; - int16x4_t v2807 = *(const int16x4_t *)v2312; - int16x4_t v2809 = *(const int16x4_t *)v2321; - int16x4_t v2811 = *(const int16x4_t *)v2330; - int16x4_t v2813 = *(const int16x4_t *)v2339; - int16x4_t v2815 = *(const int16x4_t *)v2348; - int16x4_t v2819 = *(const int16x4_t *)v2366; - int16x4_t v2821 = *(const int16x4_t *)v2375; - int16x4_t v2823 = *(const int16x4_t *)v2384; - int16x4_t v2825 = *(const int16x4_t *)v2393; - int16x4_t v2827 = *(const int16x4_t *)v2402; - int16x4_t v2829 = *(const int16x4_t *)v2411; - int16x4_t v2831 = *(const int16x4_t *)v2420; - int16x4_t v2833 = *(const int16x4_t *)v2429; - int16x4_t v2835 = *(const int16x4_t *)v2438; - int16x4_t v2837 = *(const int16x4_t *)v2447; - int16x4_t v2839 = *(const int16x4_t *)v2456; - int16x4_t v2841 = *(const 
int16x4_t *)v2465; - int16x4_t v2843 = *(const int16x4_t *)v2474; - int16x4_t v2845 = *(const int16x4_t *)v2483; - int16x4_t v2847 = *(const int16x4_t *)v2492; + int16x4_t v2787 = vld1_s16((const int16_t *)v2222); + int16x4_t v2789 = vld1_s16((const int16_t *)v2231); + int16x4_t v2791 = vld1_s16((const int16_t *)v2240); + int16x4_t v2793 = vld1_s16((const int16_t *)v2249); + int16x4_t v2795 = vld1_s16((const int16_t *)v2258); + int16x4_t v2797 = vld1_s16((const int16_t *)v2267); + int16x4_t v2799 = vld1_s16((const int16_t *)v2276); + int16x4_t v2801 = vld1_s16((const int16_t *)v2285); + int16x4_t v2803 = vld1_s16((const int16_t *)v2294); + int16x4_t v2805 = vld1_s16((const int16_t *)v2303); + int16x4_t v2807 = vld1_s16((const int16_t *)v2312); + int16x4_t v2809 = vld1_s16((const int16_t *)v2321); + int16x4_t v2811 = vld1_s16((const int16_t *)v2330); + int16x4_t v2813 = vld1_s16((const int16_t *)v2339); + int16x4_t v2815 = vld1_s16((const int16_t *)v2348); + int16x4_t v2819 = vld1_s16((const int16_t *)v2366); + int16x4_t v2821 = vld1_s16((const int16_t *)v2375); + int16x4_t v2823 = vld1_s16((const int16_t *)v2384); + int16x4_t v2825 = vld1_s16((const int16_t *)v2393); + int16x4_t v2827 = vld1_s16((const int16_t *)v2402); + int16x4_t v2829 = vld1_s16((const int16_t *)v2411); + int16x4_t v2831 = vld1_s16((const int16_t *)v2420); + int16x4_t v2833 = vld1_s16((const int16_t *)v2429); + int16x4_t v2835 = vld1_s16((const int16_t *)v2438); + int16x4_t v2837 = vld1_s16((const int16_t *)v2447); + int16x4_t v2839 = vld1_s16((const int16_t *)v2456); + int16x4_t v2841 = vld1_s16((const int16_t *)v2465); + int16x4_t v2843 = vld1_s16((const int16_t *)v2474); + int16x4_t v2845 = vld1_s16((const int16_t *)v2483); + int16x4_t v2847 = vld1_s16((const int16_t *)v2492); float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v2787), 15); float32x4_t v46 = vcvtq_n_f32_s32(vmovl_s16(v2789), 15); float32x4_t v54 = vcvtq_n_f32_s32(vmovl_s16(v2791), 15); @@ -16487,8 +16487,8 @@ void 
armral_fft_cs16_cf32_cf32_ac_n_uu32(const armral_cmplx_int16_t *restrict x, float32x4_t v1079 = vrev64q_f32(v494); float32x4_t v1086 = vmulq_f32(v634, v1085); float32x4_t v1092 = vrev64q_f32(v634); - *(float32x4_t *)v2502 = v687; - *(float32x4_t *)v2520 = v688; + vst1q_f32((float32_t *)v2502, v687); + vst1q_f32((float32_t *)v2520, v688); float32x4_t v154 = vrev64q_f32(v148); float32x4_t v157 = vaddq_f32(v67, v147); float32x4_t v158 = vsubq_f32(v67, v147); @@ -16532,8 +16532,8 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu32(const armral_cmplx_int16_t *restrict x, float32x4_t v1022 = vrev64q_f32(v674); float32x4_t v1095 = vfmaq_f32(v1073, v1079, v1080); float32x4_t v1096 = vfmaq_f32(v1086, v1092, v1093); - *(float32x4_t *)v2511 = v689; - *(float32x4_t *)v2529 = v690; + vst1q_f32((float32_t *)v2511, v689); + vst1q_f32((float32_t *)v2529, v690); float32x4_t v159 = vsubq_f32(v68, v156); float32x4_t v160 = vaddq_f32(v68, v156); float32x4_t v306 = vrev64q_f32(v300); @@ -16575,8 +16575,8 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu32(const armral_cmplx_int16_t *restrict x, float32x4_t v1149 = vrev64q_f32(v536); float32x4_t v1156 = vmulq_f32(v676, v1155); float32x4_t v1162 = vrev64q_f32(v676); - *(float32x4_t *)v2646 = v967; - *(float32x4_t *)v2664 = v968; + vst1q_f32((float32_t *)v2646, v967); + vst1q_f32((float32_t *)v2664, v968); float32x4_t v311 = vsubq_f32(v158, v308); float32x4_t v312 = vaddq_f32(v158, v308); float32x4_t v395 = vsubq_f32(v160, v392); @@ -16591,8 +16591,8 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu32(const armral_cmplx_int16_t *restrict x, float32x4_t v1106 = vmulq_f32(v1104, v1175); float32x4_t v1107 = vaddq_f32(v353, v1097); float32x4_t v1108 = vsubq_f32(v353, v1097); - *(float32x4_t *)v2574 = v827; - *(float32x4_t *)v2592 = v828; + vst1q_f32((float32_t *)v2574, v827); + vst1q_f32((float32_t *)v2592, v828); float32x4_t v754 = vrev64q_f32(v748); float32x4_t v757 = vaddq_f32(v309, v747); float32x4_t v758 = vsubq_f32(v309, v747); @@ -16607,24 +16607,24 @@ void 
armral_fft_cs16_cf32_cf32_ac_n_uu32(const armral_cmplx_int16_t *restrict x, float32x4_t v1110 = vaddq_f32(v354, v1106); float32x4_t v1165 = vfmaq_f32(v1143, v1149, v1150); float32x4_t v1166 = vfmaq_f32(v1156, v1162, v1163); - *(float32x4_t *)v2655 = v969; - *(float32x4_t *)v2673 = v970; - *(float32x4_t *)v2718 = v1107; - *(float32x4_t *)v2736 = v1108; + vst1q_f32((float32_t *)v2655, v969); + vst1q_f32((float32_t *)v2673, v970); + vst1q_f32((float32_t *)v2718, v1107); + vst1q_f32((float32_t *)v2736, v1108); float32x4_t v756 = vmulq_f32(v754, v1175); float32x4_t v887 = vaddq_f32(v885, v886); float32x4_t v888 = vsubq_f32(v886, v885); float32x4_t v1036 = vmulq_f32(v1034, v1175); float32x4_t v1167 = vaddq_f32(v1165, v1166); float32x4_t v1168 = vsubq_f32(v1166, v1165); - *(float32x4_t *)v2538 = v757; - *(float32x4_t *)v2556 = v758; - *(float32x4_t *)v2583 = v829; - *(float32x4_t *)v2601 = v830; - *(float32x4_t *)v2682 = v1037; - *(float32x4_t *)v2700 = v1038; - *(float32x4_t *)v2727 = v1109; - *(float32x4_t *)v2745 = v1110; + vst1q_f32((float32_t *)v2538, v757); + vst1q_f32((float32_t *)v2556, v758); + vst1q_f32((float32_t *)v2583, v829); + vst1q_f32((float32_t *)v2601, v830); + vst1q_f32((float32_t *)v2682, v1037); + vst1q_f32((float32_t *)v2700, v1038); + vst1q_f32((float32_t *)v2727, v1109); + vst1q_f32((float32_t *)v2745, v1110); float32x4_t v759 = vsubq_f32(v310, v756); float32x4_t v760 = vaddq_f32(v310, v756); float32x4_t v894 = vrev64q_f32(v888); @@ -16637,22 +16637,22 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu32(const armral_cmplx_int16_t *restrict x, float32x4_t v1178 = vsubq_f32(v395, v1167); float32x4_t v896 = vmulq_f32(v894, v1175); float32x4_t v1176 = vmulq_f32(v1174, v1175); - *(float32x4_t *)v2547 = v759; - *(float32x4_t *)v2565 = v760; - *(float32x4_t *)v2610 = v897; - *(float32x4_t *)v2628 = v898; - *(float32x4_t *)v2691 = v1039; - *(float32x4_t *)v2709 = v1040; - *(float32x4_t *)v2754 = v1177; - *(float32x4_t *)v2772 = v1178; + vst1q_f32((float32_t 
*)v2547, v759); + vst1q_f32((float32_t *)v2565, v760); + vst1q_f32((float32_t *)v2610, v897); + vst1q_f32((float32_t *)v2628, v898); + vst1q_f32((float32_t *)v2691, v1039); + vst1q_f32((float32_t *)v2709, v1040); + vst1q_f32((float32_t *)v2754, v1177); + vst1q_f32((float32_t *)v2772, v1178); float32x4_t v899 = vsubq_f32(v394, v896); float32x4_t v900 = vaddq_f32(v394, v896); float32x4_t v1179 = vsubq_f32(v396, v1176); float32x4_t v1180 = vaddq_f32(v396, v1176); - *(float32x4_t *)v2619 = v899; - *(float32x4_t *)v2637 = v900; - *(float32x4_t *)v2763 = v1179; - *(float32x4_t *)v2781 = v1180; + vst1q_f32((float32_t *)v2619, v899); + vst1q_f32((float32_t *)v2637, v900); + vst1q_f32((float32_t *)v2763, v1179); + vst1q_f32((float32_t *)v2781, v1180); v5 += 2 * 1; v6 += 2 * 1; } diff --git a/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uu.c b/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uu.c index 5ccc3e69c18aa52d0f494c54e8f00681ccd43029..9323e563bdb2ff6a2c7581987bfeac58d1ee261e 100644 --- a/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uu.c +++ b/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uu.c @@ -23,16 +23,16 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu2(const armral_cmplx_int16_t *restrict x, int32_t *v146 = &v6[ostride]; const int32_t *v118 = &v5[0]; int32_t *v137 = &v6[0]; - int16x4_t v152 = *(const int16x4_t *)v127; + int16x4_t v152 = vld1_s16((const int16_t *)v127); float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v152), 15); - int16x4_t v150 = *(const int16x4_t *)v118; + int16x4_t v150 = vld1_s16((const int16_t *)v118); float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v150), 15); float32x4_t v37 = vaddq_f32(v28, v36); float32x4_t v38 = vsubq_f32(v28, v36); int16x4_t v51 = vqmovn_s32(vcvtq_n_s32_f32(v37, 15)); int16x4_t v59 = vqmovn_s32(vcvtq_n_s32_f32(v38, 15)); - *(int16x4_t *)v137 = v51; - *(int16x4_t *)v146 = v59; + vst1_s16((int16_t *)v137, v51); + vst1_s16((int16_t *)v146, v59); v5 += 2 * 1; v6 += 2 * 1; } @@ -127,16 +127,16 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu3(const 
armral_cmplx_int16_t *restrict x, float32x2_t v60 = (float32x2_t){v58, v59}; const int32_t *v187 = &v5[0]; int32_t *v197 = &v6[0]; - int16x4_t v219 = *(const int16x4_t *)v168; + int16x4_t v219 = vld1_s16((const int16_t *)v168); float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v219), 15); float32x4_t v56 = vcombine_f32(v55, v55); float32x2_t v62 = vmul_f32(v61, v60); const int32_t *v177 = &v5[istride * 2]; int32_t *v215 = &v6[ostride * 2]; - int16x4_t v223 = *(const int16x4_t *)v187; + int16x4_t v223 = vld1_s16((const int16_t *)v187); float32x4_t v46 = vcvtq_n_f32_s32(vmovl_s16(v223), 15); float32x4_t v64 = vcombine_f32(v62, v62); - int16x4_t v221 = *(const int16x4_t *)v177; + int16x4_t v221 = vld1_s16((const int16_t *)v177); float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v221), 15); float32x4_t v37 = vaddq_f32(v28, v36); float32x4_t v38 = vsubq_f32(v28, v36); @@ -148,11 +148,11 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu3(const armral_cmplx_int16_t *restrict x, int16x4_t v71 = vqmovn_s32(vcvtq_n_s32_f32(v47, 15)); float32x4_t v67 = vaddq_f32(v66, v65); float32x4_t v68 = vsubq_f32(v66, v65); - *(int16x4_t *)v197 = v71; + vst1_s16((int16_t *)v197, v71); int16x4_t v79 = vqmovn_s32(vcvtq_n_s32_f32(v68, 15)); int16x4_t v87 = vqmovn_s32(vcvtq_n_s32_f32(v67, 15)); - *(int16x4_t *)v206 = v79; - *(int16x4_t *)v215 = v87; + vst1_s16((int16_t *)v206, v79); + vst1_s16((int16_t *)v215, v87); v5 += 2 * 1; v6 += 2 * 1; } @@ -295,18 +295,18 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu4(const armral_cmplx_int16_t *restrict x, float32x2_t v76 = (float32x2_t){v74, v75}; const int32_t *v210 = &v5[0]; int32_t *v247 = &v6[0]; - int16x4_t v282 = *(const int16x4_t *)v228; + int16x4_t v282 = vld1_s16((const int16_t *)v228); float32x4_t v46 = vcvtq_n_f32_s32(vmovl_s16(v282), 15); float32x2_t v78 = vmul_f32(v77, v76); const int32_t *v219 = &v5[istride * 2]; const int32_t *v237 = &v5[istride * 3]; int32_t *v265 = &v6[ostride * 2]; int32_t *v274 = &v6[ostride * 3]; - int16x4_t v278 = *(const int16x4_t 
*)v210; + int16x4_t v278 = vld1_s16((const int16_t *)v210); float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v278), 15); float32x4_t v80 = vcombine_f32(v78, v78); - int16x4_t v280 = *(const int16x4_t *)v219; - int16x4_t v284 = *(const int16x4_t *)v237; + int16x4_t v280 = vld1_s16((const int16_t *)v219); + int16x4_t v284 = vld1_s16((const int16_t *)v237); float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v280), 15); float32x4_t v54 = vcvtq_n_f32_s32(vmovl_s16(v284), 15); float32x4_t v37 = vaddq_f32(v28, v36); @@ -321,12 +321,12 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu4(const armral_cmplx_int16_t *restrict x, int16x4_t v102 = vqmovn_s32(vcvtq_n_s32_f32(v58, 15)); float32x4_t v82 = vaddq_f32(v38, v81); float32x4_t v83 = vsubq_f32(v38, v81); - *(int16x4_t *)v247 = v86; - *(int16x4_t *)v265 = v102; + vst1_s16((int16_t *)v247, v86); + vst1_s16((int16_t *)v265, v102); int16x4_t v94 = vqmovn_s32(vcvtq_n_s32_f32(v83, 15)); int16x4_t v110 = vqmovn_s32(vcvtq_n_s32_f32(v82, 15)); - *(int16x4_t *)v256 = v94; - *(int16x4_t *)v274 = v110; + vst1_s16((int16_t *)v256, v94); + vst1_s16((int16_t *)v274, v110); v5 += 2 * 1; v6 += 2 * 1; } @@ -501,7 +501,7 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu5(const armral_cmplx_int16_t *restrict x, float32x2_t v102 = (float32x2_t){v100, v101}; const int32_t *v322 = &v5[0]; int32_t *v332 = &v6[0]; - int16x4_t v372 = *(const int16x4_t *)v285; + int16x4_t v372 = vld1_s16((const int16_t *)v285); float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v372), 15); float32x4_t v77 = vcombine_f32(v76, v76); float32x4_t v82 = vcombine_f32(v81, v81); @@ -514,14 +514,14 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu5(const armral_cmplx_int16_t *restrict x, int32_t *v350 = &v6[ostride * 2]; int32_t *v359 = &v6[ostride * 3]; int32_t *v368 = &v6[ostride * 4]; - int16x4_t v380 = *(const int16x4_t *)v322; + int16x4_t v380 = vld1_s16((const int16_t *)v322); float32x4_t v67 = vcvtq_n_f32_s32(vmovl_s16(v380), 15); float32x4_t v90 = vcombine_f32(v88, v88); float32x4_t v98 = vcombine_f32(v96, 
v96); float32x4_t v106 = vcombine_f32(v104, v104); - int16x4_t v374 = *(const int16x4_t *)v294; - int16x4_t v376 = *(const int16x4_t *)v303; - int16x4_t v378 = *(const int16x4_t *)v312; + int16x4_t v374 = vld1_s16((const int16_t *)v294); + int16x4_t v376 = vld1_s16((const int16_t *)v303); + int16x4_t v378 = vld1_s16((const int16_t *)v312); float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v374), 15); float32x4_t v46 = vcvtq_n_f32_s32(vmovl_s16(v376), 15); float32x4_t v54 = vcvtq_n_f32_s32(vmovl_s16(v378), 15); @@ -547,7 +547,7 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu5(const armral_cmplx_int16_t *restrict x, float32x4_t v110 = vsubq_f32(v108, v83); float32x4_t v111 = vsubq_f32(v91, v99); float32x4_t v112 = vaddq_f32(v99, v107); - *(int16x4_t *)v332 = v119; + vst1_s16((int16_t *)v332, v119); float32x4_t v113 = vaddq_f32(v109, v111); float32x4_t v114 = vsubq_f32(v109, v111); float32x4_t v115 = vaddq_f32(v110, v112); @@ -556,10 +556,10 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu5(const armral_cmplx_int16_t *restrict x, int16x4_t v135 = vqmovn_s32(vcvtq_n_s32_f32(v116, 15)); int16x4_t v143 = vqmovn_s32(vcvtq_n_s32_f32(v115, 15)); int16x4_t v151 = vqmovn_s32(vcvtq_n_s32_f32(v113, 15)); - *(int16x4_t *)v341 = v127; - *(int16x4_t *)v350 = v135; - *(int16x4_t *)v359 = v143; - *(int16x4_t *)v368 = v151; + vst1_s16((int16_t *)v341, v127); + vst1_s16((int16_t *)v350, v135); + vst1_s16((int16_t *)v359, v143); + vst1_s16((int16_t *)v368, v151); v5 += 2 * 1; v6 += 2 * 1; } @@ -800,7 +800,7 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu6(const armral_cmplx_int16_t *restrict x, float32x2_t v114 = (float32x2_t){v112, v113}; const int32_t *v310 = &v5[0]; int32_t *v365 = &v6[0]; - int16x4_t v424 = *(const int16x4_t *)v355; + int16x4_t v424 = vld1_s16((const int16_t *)v355); float32x4_t v72 = vcvtq_n_f32_s32(vmovl_s16(v424), 15); float32x4_t v110 = vcombine_f32(v109, v109); float32x2_t v116 = vmul_f32(v115, v114); @@ -812,13 +812,13 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu6(const 
armral_cmplx_int16_t *restrict x, int32_t *v383 = &v6[ostride * 4]; int32_t *v401 = &v6[ostride * 2]; int32_t *v410 = &v6[ostride * 5]; - int16x4_t v414 = *(const int16x4_t *)v310; + int16x4_t v414 = vld1_s16((const int16_t *)v310); float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v414), 15); float32x4_t v118 = vcombine_f32(v116, v116); - int16x4_t v416 = *(const int16x4_t *)v319; - int16x4_t v418 = *(const int16x4_t *)v328; - int16x4_t v420 = *(const int16x4_t *)v337; - int16x4_t v422 = *(const int16x4_t *)v346; + int16x4_t v416 = vld1_s16((const int16_t *)v319); + int16x4_t v418 = vld1_s16((const int16_t *)v328); + int16x4_t v420 = vld1_s16((const int16_t *)v337); + int16x4_t v422 = vld1_s16((const int16_t *)v346); float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v416), 15); float32x4_t v46 = vcvtq_n_f32_s32(vmovl_s16(v418), 15); float32x4_t v54 = vcvtq_n_f32_s32(vmovl_s16(v420), 15); @@ -849,16 +849,16 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu6(const armral_cmplx_int16_t *restrict x, float32x4_t v98 = vsubq_f32(v96, v95); float32x4_t v121 = vaddq_f32(v120, v119); float32x4_t v122 = vsubq_f32(v120, v119); - *(int16x4_t *)v365 = v125; - *(int16x4_t *)v374 = v133; + vst1_s16((int16_t *)v365, v125); + vst1_s16((int16_t *)v374, v133); int16x4_t v141 = vqmovn_s32(vcvtq_n_s32_f32(v98, 15)); int16x4_t v149 = vqmovn_s32(vcvtq_n_s32_f32(v122, 15)); int16x4_t v157 = vqmovn_s32(vcvtq_n_s32_f32(v97, 15)); int16x4_t v165 = vqmovn_s32(vcvtq_n_s32_f32(v121, 15)); - *(int16x4_t *)v383 = v141; - *(int16x4_t *)v392 = v149; - *(int16x4_t *)v401 = v157; - *(int16x4_t *)v410 = v165; + vst1_s16((int16_t *)v383, v141); + vst1_s16((int16_t *)v392, v149); + vst1_s16((int16_t *)v401, v157); + vst1_s16((int16_t *)v410, v165); v5 += 2 * 1; v6 += 2 * 1; } @@ -1119,7 +1119,7 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu7(const armral_cmplx_int16_t *restrict x, float32x2_t v145 = (float32x2_t){v143, v144}; const int32_t *v467 = &v5[0]; int32_t *v477 = &v6[0]; - int16x4_t v535 = *(const int16x4_t *)v412; + 
int16x4_t v535 = vld1_s16((const int16_t *)v412); float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v535), 15); float32x4_t v102 = vcombine_f32(v101, v101); float32x4_t v107 = vcombine_f32(v106, v106); @@ -1139,17 +1139,17 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu7(const armral_cmplx_int16_t *restrict x, int32_t *v513 = &v6[ostride * 4]; int32_t *v522 = &v6[ostride * 5]; int32_t *v531 = &v6[ostride * 6]; - int16x4_t v547 = *(const int16x4_t *)v467; + int16x4_t v547 = vld1_s16((const int16_t *)v467); float32x4_t v84 = vcvtq_n_f32_s32(vmovl_s16(v547), 15); float32x4_t v125 = vcombine_f32(v123, v123); float32x4_t v133 = vcombine_f32(v131, v131); float32x4_t v141 = vcombine_f32(v139, v139); float32x4_t v149 = vcombine_f32(v147, v147); - int16x4_t v537 = *(const int16x4_t *)v421; - int16x4_t v539 = *(const int16x4_t *)v430; - int16x4_t v541 = *(const int16x4_t *)v439; - int16x4_t v543 = *(const int16x4_t *)v448; - int16x4_t v545 = *(const int16x4_t *)v457; + int16x4_t v537 = vld1_s16((const int16_t *)v421); + int16x4_t v539 = vld1_s16((const int16_t *)v430); + int16x4_t v541 = vld1_s16((const int16_t *)v439); + int16x4_t v543 = vld1_s16((const int16_t *)v448); + int16x4_t v545 = vld1_s16((const int16_t *)v457); float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v537), 15); float32x4_t v46 = vcvtq_n_f32_s32(vmovl_s16(v539), 15); float32x4_t v54 = vcvtq_n_f32_s32(vmovl_s16(v541), 15); @@ -1192,7 +1192,7 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu7(const armral_cmplx_int16_t *restrict x, float32x4_t v158 = vaddq_f32(v126, v134); float32x4_t v160 = vsubq_f32(v126, v134); float32x4_t v162 = vsubq_f32(v126, v142); - *(int16x4_t *)v477 = v172; + vst1_s16((int16_t *)v477, v172); float32x4_t v153 = vaddq_f32(v152, v113); float32x4_t v155 = vsubq_f32(v154, v118); float32x4_t v157 = vaddq_f32(v156, v118); @@ -1211,12 +1211,12 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu7(const armral_cmplx_int16_t *restrict x, int16x4_t v204 = vqmovn_s32(vcvtq_n_s32_f32(v169, 15)); int16x4_t v212 = 
vqmovn_s32(vcvtq_n_s32_f32(v166, 15)); int16x4_t v220 = vqmovn_s32(vcvtq_n_s32_f32(v164, 15)); - *(int16x4_t *)v486 = v180; - *(int16x4_t *)v495 = v188; - *(int16x4_t *)v504 = v196; - *(int16x4_t *)v513 = v204; - *(int16x4_t *)v522 = v212; - *(int16x4_t *)v531 = v220; + vst1_s16((int16_t *)v486, v180); + vst1_s16((int16_t *)v495, v188); + vst1_s16((int16_t *)v504, v196); + vst1_s16((int16_t *)v513, v204); + vst1_s16((int16_t *)v522, v212); + vst1_s16((int16_t *)v531, v220); v5 += 2 * 1; v6 += 2 * 1; } @@ -1578,7 +1578,7 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu8(const armral_cmplx_int16_t *restrict x, float32x2_t v147 = (float32x2_t){v146, v146}; const int32_t *v406 = &v5[0]; int32_t *v479 = &v6[0]; - int16x4_t v554 = *(const int16x4_t *)v442; + int16x4_t v554 = vld1_s16((const int16_t *)v442); float32x4_t v64 = vcvtq_n_f32_s32(vmovl_s16(v554), 15); float32x2_t v133 = vmul_f32(v140, v131); float32x2_t v141 = vmul_f32(v140, v139); @@ -1595,16 +1595,16 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu8(const armral_cmplx_int16_t *restrict x, int32_t *v524 = &v6[ostride * 5]; int32_t *v533 = &v6[ostride * 6]; int32_t *v542 = &v6[ostride * 7]; - int16x4_t v546 = *(const int16x4_t *)v406; + int16x4_t v546 = vld1_s16((const int16_t *)v406); float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v546), 15); float32x4_t v135 = vcombine_f32(v133, v133); float32x4_t v143 = vcombine_f32(v141, v141); - int16x4_t v548 = *(const int16x4_t *)v415; - int16x4_t v550 = *(const int16x4_t *)v424; - int16x4_t v552 = *(const int16x4_t *)v433; - int16x4_t v556 = *(const int16x4_t *)v451; - int16x4_t v558 = *(const int16x4_t *)v460; - int16x4_t v560 = *(const int16x4_t *)v469; + int16x4_t v548 = vld1_s16((const int16_t *)v415); + int16x4_t v550 = vld1_s16((const int16_t *)v424); + int16x4_t v552 = vld1_s16((const int16_t *)v433); + int16x4_t v556 = vld1_s16((const int16_t *)v451); + int16x4_t v558 = vld1_s16((const int16_t *)v460); + int16x4_t v560 = vld1_s16((const int16_t *)v469); float32x4_t v36 = 
vcvtq_n_f32_s32(vmovl_s16(v548), 15); float32x4_t v46 = vcvtq_n_f32_s32(vmovl_s16(v550), 15); float32x4_t v54 = vcvtq_n_f32_s32(vmovl_s16(v552), 15); @@ -1642,8 +1642,8 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu8(const armral_cmplx_int16_t *restrict x, float32x4_t v151 = vsubq_f32(v94, v123); float32x4_t v154 = vaddq_f32(v136, v144); float32x4_t v155 = vsubq_f32(v136, v144); - *(int16x4_t *)v479 = v162; - *(int16x4_t *)v515 = v194; + vst1_s16((int16_t *)v479, v162); + vst1_s16((int16_t *)v515, v194); float32x4_t v156 = vaddq_f32(v152, v154); float32x4_t v157 = vsubq_f32(v152, v154); float32x4_t v158 = vaddq_f32(v153, v155); @@ -1654,12 +1654,12 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu8(const armral_cmplx_int16_t *restrict x, int16x4_t v186 = vqmovn_s32(vcvtq_n_s32_f32(v158, 15)); int16x4_t v202 = vqmovn_s32(vcvtq_n_s32_f32(v159, 15)); int16x4_t v218 = vqmovn_s32(vcvtq_n_s32_f32(v156, 15)); - *(int16x4_t *)v497 = v178; - *(int16x4_t *)v533 = v210; - *(int16x4_t *)v488 = v170; - *(int16x4_t *)v506 = v186; - *(int16x4_t *)v524 = v202; - *(int16x4_t *)v542 = v218; + vst1_s16((int16_t *)v497, v178); + vst1_s16((int16_t *)v533, v210); + vst1_s16((int16_t *)v488, v170); + vst1_s16((int16_t *)v506, v186); + vst1_s16((int16_t *)v524, v202); + vst1_s16((int16_t *)v542, v218); v5 += 2 * 1; v6 += 2 * 1; } @@ -1996,7 +1996,7 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu9(const armral_cmplx_int16_t *restrict x, float32x2_t v177 = (float32x2_t){v175, v176}; const int32_t *v583 = &v5[0]; int32_t *v593 = &v6[0]; - int16x4_t v669 = *(const int16x4_t *)v510; + int16x4_t v669 = vld1_s16((const int16_t *)v510); float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v669), 15); float32x4_t v121 = vcombine_f32(v120, v120); float32x4_t v134 = vcombine_f32(v133, v133); @@ -2021,19 +2021,19 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu9(const armral_cmplx_int16_t *restrict x, int32_t *v647 = &v6[ostride * 6]; int32_t *v656 = &v6[ostride * 7]; int32_t *v665 = &v6[ostride * 8]; - int16x4_t v685 = *(const 
int16x4_t *)v583; + int16x4_t v685 = vld1_s16((const int16_t *)v583); float32x4_t v103 = vcvtq_n_f32_s32(vmovl_s16(v685), 15); float32x4_t v142 = vcombine_f32(v140, v140); float32x4_t v165 = vcombine_f32(v163, v163); float32x4_t v173 = vcombine_f32(v171, v171); float32x4_t v181 = vcombine_f32(v179, v179); - int16x4_t v671 = *(const int16x4_t *)v519; - int16x4_t v673 = *(const int16x4_t *)v528; - int16x4_t v675 = *(const int16x4_t *)v537; - int16x4_t v677 = *(const int16x4_t *)v546; - int16x4_t v679 = *(const int16x4_t *)v555; - int16x4_t v681 = *(const int16x4_t *)v564; - int16x4_t v683 = *(const int16x4_t *)v573; + int16x4_t v671 = vld1_s16((const int16_t *)v519); + int16x4_t v673 = vld1_s16((const int16_t *)v528); + int16x4_t v675 = vld1_s16((const int16_t *)v537); + int16x4_t v677 = vld1_s16((const int16_t *)v546); + int16x4_t v679 = vld1_s16((const int16_t *)v555); + int16x4_t v681 = vld1_s16((const int16_t *)v564); + int16x4_t v683 = vld1_s16((const int16_t *)v573); float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v671), 15); float32x4_t v46 = vcvtq_n_f32_s32(vmovl_s16(v673), 15); float32x4_t v54 = vcvtq_n_f32_s32(vmovl_s16(v675), 15); @@ -2088,7 +2088,7 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu9(const armral_cmplx_int16_t *restrict x, int16x4_t v210 = vqmovn_s32(vcvtq_n_s32_f32(v104, 15)); float32x4_t v185 = vaddq_f32(v104, v184); float32x4_t v189 = vaddq_f32(v188, v183); - *(int16x4_t *)v593 = v210; + vst1_s16((int16_t *)v593, v210); float32x4_t v186 = vaddq_f32(v185, v130); float32x4_t v187 = vsubq_f32(v185, v130); float32x4_t v190 = vaddq_f32(v189, v148); @@ -2105,20 +2105,20 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu9(const armral_cmplx_int16_t *restrict x, float32x4_t v205 = vsubq_f32(v193, v199); float32x4_t v206 = vaddq_f32(v195, v201); float32x4_t v207 = vsubq_f32(v195, v201); - *(int16x4_t *)v620 = v234; - *(int16x4_t *)v647 = v258; + vst1_s16((int16_t *)v620, v234); + vst1_s16((int16_t *)v647, v258); int16x4_t v218 = vqmovn_s32(vcvtq_n_s32_f32(v203, 15)); 
int16x4_t v226 = vqmovn_s32(vcvtq_n_s32_f32(v204, 15)); int16x4_t v242 = vqmovn_s32(vcvtq_n_s32_f32(v207, 15)); int16x4_t v250 = vqmovn_s32(vcvtq_n_s32_f32(v206, 15)); int16x4_t v266 = vqmovn_s32(vcvtq_n_s32_f32(v205, 15)); int16x4_t v274 = vqmovn_s32(vcvtq_n_s32_f32(v202, 15)); - *(int16x4_t *)v602 = v218; - *(int16x4_t *)v611 = v226; - *(int16x4_t *)v629 = v242; - *(int16x4_t *)v638 = v250; - *(int16x4_t *)v656 = v266; - *(int16x4_t *)v665 = v274; + vst1_s16((int16_t *)v602, v218); + vst1_s16((int16_t *)v611, v226); + vst1_s16((int16_t *)v629, v242); + vst1_s16((int16_t *)v638, v250); + vst1_s16((int16_t *)v656, v266); + vst1_s16((int16_t *)v665, v274); v5 += 2 * 1; v6 += 2 * 1; } @@ -2564,7 +2564,7 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu10(const armral_cmplx_int16_t *restrict x, float32x2_t v208 = (float32x2_t){v206, v207}; const int32_t *v552 = &v5[0]; int32_t *v643 = &v6[0]; - int16x4_t v742 = *(const int16x4_t *)v615; + int16x4_t v742 = vld1_s16((const int16_t *)v615); float32x4_t v90 = vcvtq_n_f32_s32(vmovl_s16(v742), 15); float32x4_t v183 = vcombine_f32(v182, v182); float32x4_t v188 = vcombine_f32(v187, v187); @@ -2587,19 +2587,19 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu10(const armral_cmplx_int16_t *restrict x, int32_t *v706 = &v6[ostride * 3]; int32_t *v715 = &v6[ostride * 4]; int32_t *v724 = &v6[ostride * 9]; - int16x4_t v728 = *(const int16x4_t *)v552; + int16x4_t v728 = vld1_s16((const int16_t *)v552); float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v728), 15); float32x4_t v196 = vcombine_f32(v194, v194); float32x4_t v204 = vcombine_f32(v202, v202); float32x4_t v212 = vcombine_f32(v210, v210); - int16x4_t v730 = *(const int16x4_t *)v561; - int16x4_t v732 = *(const int16x4_t *)v570; - int16x4_t v734 = *(const int16x4_t *)v579; - int16x4_t v736 = *(const int16x4_t *)v588; - int16x4_t v738 = *(const int16x4_t *)v597; - int16x4_t v740 = *(const int16x4_t *)v606; - int16x4_t v744 = *(const int16x4_t *)v624; - int16x4_t v746 = *(const int16x4_t *)v633; + 
int16x4_t v730 = vld1_s16((const int16_t *)v561); + int16x4_t v732 = vld1_s16((const int16_t *)v570); + int16x4_t v734 = vld1_s16((const int16_t *)v579); + int16x4_t v736 = vld1_s16((const int16_t *)v588); + int16x4_t v738 = vld1_s16((const int16_t *)v597); + int16x4_t v740 = vld1_s16((const int16_t *)v606); + int16x4_t v744 = vld1_s16((const int16_t *)v624); + int16x4_t v746 = vld1_s16((const int16_t *)v633); float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v730), 15); float32x4_t v46 = vcvtq_n_f32_s32(vmovl_s16(v732), 15); float32x4_t v54 = vcvtq_n_f32_s32(vmovl_s16(v734), 15); @@ -2662,8 +2662,8 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu10(const armral_cmplx_int16_t *restrict x, float32x4_t v216 = vsubq_f32(v214, v189); float32x4_t v217 = vsubq_f32(v197, v205); float32x4_t v218 = vaddq_f32(v205, v213); - *(int16x4_t *)v643 = v225; - *(int16x4_t *)v652 = v233; + vst1_s16((int16_t *)v643, v225); + vst1_s16((int16_t *)v652, v233); float32x4_t v163 = vaddq_f32(v159, v161); float32x4_t v164 = vsubq_f32(v159, v161); float32x4_t v165 = vaddq_f32(v160, v162); @@ -2680,14 +2680,14 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu10(const armral_cmplx_int16_t *restrict x, int16x4_t v281 = vqmovn_s32(vcvtq_n_s32_f32(v221, 15)); int16x4_t v289 = vqmovn_s32(vcvtq_n_s32_f32(v163, 15)); int16x4_t v297 = vqmovn_s32(vcvtq_n_s32_f32(v219, 15)); - *(int16x4_t *)v661 = v241; - *(int16x4_t *)v670 = v249; - *(int16x4_t *)v679 = v257; - *(int16x4_t *)v688 = v265; - *(int16x4_t *)v697 = v273; - *(int16x4_t *)v706 = v281; - *(int16x4_t *)v715 = v289; - *(int16x4_t *)v724 = v297; + vst1_s16((int16_t *)v661, v241); + vst1_s16((int16_t *)v670, v249); + vst1_s16((int16_t *)v679, v257); + vst1_s16((int16_t *)v688, v265); + vst1_s16((int16_t *)v697, v273); + vst1_s16((int16_t *)v706, v281); + vst1_s16((int16_t *)v715, v289); + vst1_s16((int16_t *)v724, v297); v5 += 2 * 1; v6 += 2 * 1; } @@ -3164,7 +3164,7 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu11(const armral_cmplx_int16_t *restrict x, float32x2_t v275 = 
(float32x2_t){v273, v274}; const int32_t *v855 = &v5[0]; int32_t *v865 = &v6[0]; - int16x4_t v959 = *(const int16x4_t *)v764; + int16x4_t v959 = vld1_s16((const int16_t *)v764); float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v959), 15); float32x4_t v154 = vcombine_f32(v153, v153); float32x2_t v160 = vmul_f32(v276, v158); @@ -3204,7 +3204,7 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu11(const armral_cmplx_int16_t *restrict x, int32_t *v928 = &v6[ostride * 4]; int32_t *v937 = &v6[ostride * 3]; int32_t *v946 = &v6[ostride * 2]; - int16x4_t v979 = *(const int16x4_t *)v855; + int16x4_t v979 = vld1_s16((const int16_t *)v855); float32x4_t v124 = vcvtq_n_f32_s32(vmovl_s16(v979), 15); float32x4_t v162 = vcombine_f32(v160, v160); float32x4_t v215 = vcombine_f32(v213, v213); @@ -3216,15 +3216,15 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu11(const armral_cmplx_int16_t *restrict x, float32x4_t v263 = vcombine_f32(v261, v261); float32x4_t v271 = vcombine_f32(v269, v269); float32x4_t v279 = vcombine_f32(v277, v277); - int16x4_t v961 = *(const int16x4_t *)v773; - int16x4_t v963 = *(const int16x4_t *)v782; - int16x4_t v965 = *(const int16x4_t *)v791; - int16x4_t v967 = *(const int16x4_t *)v800; - int16x4_t v969 = *(const int16x4_t *)v809; - int16x4_t v971 = *(const int16x4_t *)v818; - int16x4_t v973 = *(const int16x4_t *)v827; - int16x4_t v975 = *(const int16x4_t *)v836; - int16x4_t v977 = *(const int16x4_t *)v845; + int16x4_t v961 = vld1_s16((const int16_t *)v773); + int16x4_t v963 = vld1_s16((const int16_t *)v782); + int16x4_t v965 = vld1_s16((const int16_t *)v791); + int16x4_t v967 = vld1_s16((const int16_t *)v800); + int16x4_t v969 = vld1_s16((const int16_t *)v809); + int16x4_t v971 = vld1_s16((const int16_t *)v818); + int16x4_t v973 = vld1_s16((const int16_t *)v827); + int16x4_t v975 = vld1_s16((const int16_t *)v836); + int16x4_t v977 = vld1_s16((const int16_t *)v845); float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v961), 15); float32x4_t v45 = vcvtq_n_f32_s32(vmovl_s16(v963), 15); 
float32x4_t v53 = vcvtq_n_f32_s32(vmovl_s16(v965), 15); @@ -3329,7 +3329,7 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu11(const armral_cmplx_int16_t *restrict x, float32x4_t v310 = vaddq_f32(v163, v295); float32x4_t v312 = vsubq_f32(v295, v291); float32x4_t v315 = vaddq_f32(v314, v292); - *(int16x4_t *)v865 = v330; + vst1_s16((int16_t *)v865, v330); float32x4_t v299 = vsubq_f32(v298, v288); float32x4_t v301 = vaddq_f32(v300, v289); float32x4_t v303 = vsubq_f32(v302, v289); @@ -3360,16 +3360,16 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu11(const armral_cmplx_int16_t *restrict x, int16x4_t v402 = vqmovn_s32(vcvtq_n_s32_f32(v326, 15)); int16x4_t v338 = vqmovn_s32(vcvtq_n_s32_f32(v318, 15)); int16x4_t v410 = vqmovn_s32(vcvtq_n_s32_f32(v327, 15)); - *(int16x4_t *)v883 = v346; - *(int16x4_t *)v892 = v354; - *(int16x4_t *)v901 = v362; - *(int16x4_t *)v910 = v370; - *(int16x4_t *)v919 = v378; - *(int16x4_t *)v928 = v386; - *(int16x4_t *)v937 = v394; - *(int16x4_t *)v946 = v402; - *(int16x4_t *)v874 = v338; - *(int16x4_t *)v955 = v410; + vst1_s16((int16_t *)v883, v346); + vst1_s16((int16_t *)v892, v354); + vst1_s16((int16_t *)v901, v362); + vst1_s16((int16_t *)v910, v370); + vst1_s16((int16_t *)v919, v378); + vst1_s16((int16_t *)v928, v386); + vst1_s16((int16_t *)v937, v394); + vst1_s16((int16_t *)v946, v402); + vst1_s16((int16_t *)v874, v338); + vst1_s16((int16_t *)v955, v410); v5 += 2 * 1; v6 += 2 * 1; } @@ -4057,7 +4057,7 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu12(const armral_cmplx_int16_t *restrict x, float32x2_t v223 = (float32x2_t){v222, v222}; const int32_t *v628 = &v5[0]; int32_t *v719 = &v6[0]; - int16x4_t v840 = *(const int16x4_t *)v691; + int16x4_t v840 = vld1_s16((const int16_t *)v691); float32x4_t v109 = vcvtq_n_f32_s32(vmovl_s16(v840), 15); float32x2_t v154 = vmul_f32(v216, v152); float32x4_t v179 = vcombine_f32(v178, v178); @@ -4084,21 +4084,21 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu12(const armral_cmplx_int16_t *restrict x, int32_t *v800 = &v6[ostride * 3]; 
int32_t *v809 = &v6[ostride * 7]; int32_t *v818 = &v6[ostride * 11]; - int16x4_t v826 = *(const int16x4_t *)v628; + int16x4_t v826 = vld1_s16((const int16_t *)v628); float32x4_t v46 = vcvtq_n_f32_s32(vmovl_s16(v826), 15); float32x4_t v156 = vcombine_f32(v154, v154); float32x4_t v187 = vcombine_f32(v185, v185); float32x4_t v219 = vcombine_f32(v217, v217); - int16x4_t v822 = *(const int16x4_t *)v609; - int16x4_t v824 = *(const int16x4_t *)v618; - int16x4_t v828 = *(const int16x4_t *)v637; - int16x4_t v830 = *(const int16x4_t *)v646; - int16x4_t v832 = *(const int16x4_t *)v655; - int16x4_t v834 = *(const int16x4_t *)v664; - int16x4_t v836 = *(const int16x4_t *)v673; - int16x4_t v838 = *(const int16x4_t *)v682; - int16x4_t v842 = *(const int16x4_t *)v700; - int16x4_t v844 = *(const int16x4_t *)v709; + int16x4_t v822 = vld1_s16((const int16_t *)v609); + int16x4_t v824 = vld1_s16((const int16_t *)v618); + int16x4_t v828 = vld1_s16((const int16_t *)v637); + int16x4_t v830 = vld1_s16((const int16_t *)v646); + int16x4_t v832 = vld1_s16((const int16_t *)v655); + int16x4_t v834 = vld1_s16((const int16_t *)v664); + int16x4_t v836 = vld1_s16((const int16_t *)v673); + int16x4_t v838 = vld1_s16((const int16_t *)v682); + int16x4_t v842 = vld1_s16((const int16_t *)v700); + int16x4_t v844 = vld1_s16((const int16_t *)v709); float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v822), 15); float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v824), 15); float32x4_t v55 = vcvtq_n_f32_s32(vmovl_s16(v828), 15); @@ -4167,8 +4167,8 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu12(const armral_cmplx_int16_t *restrict x, float32x4_t v230 = vsubq_f32(v228, v204); float32x4_t v283 = vaddq_f32(v282, v212); float32x4_t v284 = vsubq_f32(v282, v212); - *(int16x4_t *)v719 = v233; - *(int16x4_t *)v773 = v287; + vst1_s16((int16_t *)v719, v233); + vst1_s16((int16_t *)v773, v287); int16x4_t v241 = vqmovn_s32(vcvtq_n_s32_f32(v230, 15)); int16x4_t v249 = vqmovn_s32(vcvtq_n_s32_f32(v229, 15)); float32x4_t v255 = vaddq_f32(v159, 
v190); @@ -4181,20 +4181,20 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu12(const armral_cmplx_int16_t *restrict x, float32x4_t v257 = vsubq_f32(v255, v227); float32x4_t v310 = vaddq_f32(v309, v226); float32x4_t v311 = vsubq_f32(v309, v226); - *(int16x4_t *)v728 = v241; - *(int16x4_t *)v737 = v249; - *(int16x4_t *)v746 = v260; - *(int16x4_t *)v782 = v295; - *(int16x4_t *)v791 = v303; - *(int16x4_t *)v800 = v314; + vst1_s16((int16_t *)v728, v241); + vst1_s16((int16_t *)v737, v249); + vst1_s16((int16_t *)v746, v260); + vst1_s16((int16_t *)v782, v295); + vst1_s16((int16_t *)v791, v303); + vst1_s16((int16_t *)v800, v314); int16x4_t v268 = vqmovn_s32(vcvtq_n_s32_f32(v257, 15)); int16x4_t v276 = vqmovn_s32(vcvtq_n_s32_f32(v256, 15)); int16x4_t v322 = vqmovn_s32(vcvtq_n_s32_f32(v311, 15)); int16x4_t v330 = vqmovn_s32(vcvtq_n_s32_f32(v310, 15)); - *(int16x4_t *)v755 = v268; - *(int16x4_t *)v764 = v276; - *(int16x4_t *)v809 = v322; - *(int16x4_t *)v818 = v330; + vst1_s16((int16_t *)v755, v268); + vst1_s16((int16_t *)v764, v276); + vst1_s16((int16_t *)v809, v322); + vst1_s16((int16_t *)v818, v330); v5 += 2 * 1; v6 += 2 * 1; } @@ -4725,7 +4725,7 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu13(const armral_cmplx_int16_t *restrict x, float32x2_t v303 = (float32x2_t){v301, v302}; const int32_t *v961 = &v5[0]; int32_t *v971 = &v6[0]; - int16x4_t v1083 = *(const int16x4_t *)v852; + int16x4_t v1083 = vld1_s16((const int16_t *)v852); float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v1083), 15); float32x4_t v176 = vcombine_f32(v175, v175); float32x4_t v181 = vcombine_f32(v180, v180); @@ -4769,7 +4769,7 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu13(const armral_cmplx_int16_t *restrict x, int32_t *v1052 = &v6[ostride * 4]; int32_t *v1061 = &v6[ostride * 3]; int32_t *v1070 = &v6[ostride * 2]; - int16x4_t v1107 = *(const int16x4_t *)v961; + int16x4_t v1107 = vld1_s16((const int16_t *)v961); float32x4_t v159 = vcvtq_n_f32_s32(vmovl_s16(v1107), 15); float32x4_t v189 = vcombine_f32(v187, v187); 
float32x4_t v197 = vcombine_f32(v195, v195); @@ -4783,17 +4783,17 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu13(const armral_cmplx_int16_t *restrict x, float32x4_t v291 = vcombine_f32(v289, v289); float32x4_t v299 = vcombine_f32(v297, v297); float32x4_t v307 = vcombine_f32(v305, v305); - int16x4_t v1085 = *(const int16x4_t *)v861; - int16x4_t v1087 = *(const int16x4_t *)v870; - int16x4_t v1089 = *(const int16x4_t *)v879; - int16x4_t v1091 = *(const int16x4_t *)v888; - int16x4_t v1093 = *(const int16x4_t *)v897; - int16x4_t v1095 = *(const int16x4_t *)v906; - int16x4_t v1097 = *(const int16x4_t *)v915; - int16x4_t v1099 = *(const int16x4_t *)v924; - int16x4_t v1101 = *(const int16x4_t *)v933; - int16x4_t v1103 = *(const int16x4_t *)v942; - int16x4_t v1105 = *(const int16x4_t *)v951; + int16x4_t v1085 = vld1_s16((const int16_t *)v861); + int16x4_t v1087 = vld1_s16((const int16_t *)v870); + int16x4_t v1089 = vld1_s16((const int16_t *)v879); + int16x4_t v1091 = vld1_s16((const int16_t *)v888); + int16x4_t v1093 = vld1_s16((const int16_t *)v897); + int16x4_t v1095 = vld1_s16((const int16_t *)v906); + int16x4_t v1097 = vld1_s16((const int16_t *)v915); + int16x4_t v1099 = vld1_s16((const int16_t *)v924); + int16x4_t v1101 = vld1_s16((const int16_t *)v933); + int16x4_t v1103 = vld1_s16((const int16_t *)v942); + int16x4_t v1105 = vld1_s16((const int16_t *)v951); float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v1085), 15); float32x4_t v45 = vcvtq_n_f32_s32(vmovl_s16(v1087), 15); float32x4_t v53 = vcvtq_n_f32_s32(vmovl_s16(v1089), 15); @@ -4902,7 +4902,7 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu13(const armral_cmplx_int16_t *restrict x, float32x4_t v340 = vaddq_f32(v322, v326); float32x4_t v342 = vaddq_f32(v324, v326); float32x4_t v344 = vsubq_f32(v323, v327); - *(int16x4_t *)v971 = v362; + vst1_s16((int16_t *)v971, v362); float32x4_t v317 = vaddq_f32(v316, v231); float32x4_t v319 = vsubq_f32(v318, v236); float32x4_t v321 = vaddq_f32(v320, v236); @@ -4945,18 +4945,18 @@ void 
armral_fft_cs16_cf32_cs16_ac_n_uu13(const armral_cmplx_int16_t *restrict x, int16x4_t v442 = vqmovn_s32(vcvtq_n_s32_f32(v357, 15)); int16x4_t v450 = vqmovn_s32(vcvtq_n_s32_f32(v358, 15)); int16x4_t v458 = vqmovn_s32(vcvtq_n_s32_f32(v359, 15)); - *(int16x4_t *)v980 = v370; - *(int16x4_t *)v989 = v378; - *(int16x4_t *)v998 = v386; - *(int16x4_t *)v1007 = v394; - *(int16x4_t *)v1016 = v402; - *(int16x4_t *)v1025 = v410; - *(int16x4_t *)v1034 = v418; - *(int16x4_t *)v1043 = v426; - *(int16x4_t *)v1052 = v434; - *(int16x4_t *)v1061 = v442; - *(int16x4_t *)v1070 = v450; - *(int16x4_t *)v1079 = v458; + vst1_s16((int16_t *)v980, v370); + vst1_s16((int16_t *)v989, v378); + vst1_s16((int16_t *)v998, v386); + vst1_s16((int16_t *)v1007, v394); + vst1_s16((int16_t *)v1016, v402); + vst1_s16((int16_t *)v1025, v410); + vst1_s16((int16_t *)v1034, v418); + vst1_s16((int16_t *)v1043, v426); + vst1_s16((int16_t *)v1052, v434); + vst1_s16((int16_t *)v1061, v442); + vst1_s16((int16_t *)v1070, v450); + vst1_s16((int16_t *)v1079, v458); v5 += 2 * 1; v6 += 2 * 1; } @@ -5735,7 +5735,7 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu14(const armral_cmplx_int16_t *restrict x, float32x2_t v308 = (float32x2_t){v306, v307}; const int32_t *v814 = &v5[0]; int32_t *v941 = &v6[0]; - int16x4_t v1080 = *(const int16x4_t *)v895; + int16x4_t v1080 = vld1_s16((const int16_t *)v895); float32x4_t v108 = vcvtq_n_f32_s32(vmovl_s16(v1080), 15); float32x4_t v265 = vcombine_f32(v264, v264); float32x4_t v270 = vcombine_f32(v269, v269); @@ -5769,24 +5769,24 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu14(const armral_cmplx_int16_t *restrict x, int32_t *v1040 = &v6[ostride * 5]; int32_t *v1049 = &v6[ostride * 6]; int32_t *v1058 = &v6[ostride * 13]; - int16x4_t v1062 = *(const int16x4_t *)v814; + int16x4_t v1062 = vld1_s16((const int16_t *)v814); float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v1062), 15); float32x4_t v288 = vcombine_f32(v286, v286); float32x4_t v296 = vcombine_f32(v294, v294); float32x4_t v304 = 
vcombine_f32(v302, v302); float32x4_t v312 = vcombine_f32(v310, v310); - int16x4_t v1064 = *(const int16x4_t *)v823; - int16x4_t v1066 = *(const int16x4_t *)v832; - int16x4_t v1068 = *(const int16x4_t *)v841; - int16x4_t v1070 = *(const int16x4_t *)v850; - int16x4_t v1072 = *(const int16x4_t *)v859; - int16x4_t v1074 = *(const int16x4_t *)v868; - int16x4_t v1076 = *(const int16x4_t *)v877; - int16x4_t v1078 = *(const int16x4_t *)v886; - int16x4_t v1082 = *(const int16x4_t *)v904; - int16x4_t v1084 = *(const int16x4_t *)v913; - int16x4_t v1086 = *(const int16x4_t *)v922; - int16x4_t v1088 = *(const int16x4_t *)v931; + int16x4_t v1064 = vld1_s16((const int16_t *)v823); + int16x4_t v1066 = vld1_s16((const int16_t *)v832); + int16x4_t v1068 = vld1_s16((const int16_t *)v841); + int16x4_t v1070 = vld1_s16((const int16_t *)v850); + int16x4_t v1072 = vld1_s16((const int16_t *)v859); + int16x4_t v1074 = vld1_s16((const int16_t *)v868); + int16x4_t v1076 = vld1_s16((const int16_t *)v877); + int16x4_t v1078 = vld1_s16((const int16_t *)v886); + int16x4_t v1082 = vld1_s16((const int16_t *)v904); + int16x4_t v1084 = vld1_s16((const int16_t *)v913); + int16x4_t v1086 = vld1_s16((const int16_t *)v922); + int16x4_t v1088 = vld1_s16((const int16_t *)v931); float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v1064), 15); float32x4_t v46 = vcvtq_n_f32_s32(vmovl_s16(v1066), 15); float32x4_t v54 = vcvtq_n_f32_s32(vmovl_s16(v1068), 15); @@ -5887,8 +5887,8 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu14(const armral_cmplx_int16_t *restrict x, float32x4_t v321 = vaddq_f32(v289, v297); float32x4_t v323 = vsubq_f32(v289, v297); float32x4_t v325 = vsubq_f32(v289, v305); - *(int16x4_t *)v941 = v335; - *(int16x4_t *)v950 = v343; + vst1_s16((int16_t *)v941, v335); + vst1_s16((int16_t *)v950, v343); float32x4_t v223 = vaddq_f32(v222, v183); float32x4_t v225 = vsubq_f32(v224, v188); float32x4_t v227 = vaddq_f32(v226, v188); @@ -5925,18 +5925,18 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu14(const 
armral_cmplx_int16_t *restrict x, int16x4_t v423 = vqmovn_s32(vcvtq_n_s32_f32(v329, 15)); int16x4_t v431 = vqmovn_s32(vcvtq_n_s32_f32(v234, 15)); int16x4_t v439 = vqmovn_s32(vcvtq_n_s32_f32(v327, 15)); - *(int16x4_t *)v959 = v351; - *(int16x4_t *)v968 = v359; - *(int16x4_t *)v977 = v367; - *(int16x4_t *)v986 = v375; - *(int16x4_t *)v995 = v383; - *(int16x4_t *)v1004 = v391; - *(int16x4_t *)v1013 = v399; - *(int16x4_t *)v1022 = v407; - *(int16x4_t *)v1031 = v415; - *(int16x4_t *)v1040 = v423; - *(int16x4_t *)v1049 = v431; - *(int16x4_t *)v1058 = v439; + vst1_s16((int16_t *)v959, v351); + vst1_s16((int16_t *)v968, v359); + vst1_s16((int16_t *)v977, v367); + vst1_s16((int16_t *)v986, v375); + vst1_s16((int16_t *)v995, v383); + vst1_s16((int16_t *)v1004, v391); + vst1_s16((int16_t *)v1013, v399); + vst1_s16((int16_t *)v1022, v407); + vst1_s16((int16_t *)v1031, v415); + vst1_s16((int16_t *)v1040, v423); + vst1_s16((int16_t *)v1049, v431); + vst1_s16((int16_t *)v1058, v439); v5 += 2 * 1; v6 += 2 * 1; } @@ -6641,7 +6641,7 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu15(const armral_cmplx_int16_t *restrict x, float32x2_t v312 = (float32x2_t){v311, v311}; const int32_t *v856 = &v5[0]; int32_t *v974 = &v6[0]; - int16x4_t v1118 = *(const int16x4_t *)v901; + int16x4_t v1118 = vld1_s16((const int16_t *)v901); float32x4_t v90 = vcvtq_n_f32_s32(vmovl_s16(v1118), 15); float32x4_t v172 = vcombine_f32(v171, v171); float32x4_t v177 = vcombine_f32(v176, v176); @@ -6686,7 +6686,7 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu15(const armral_cmplx_int16_t *restrict x, int32_t *v1082 = &v6[ostride * 9]; int32_t *v1091 = &v6[ostride * 4]; int32_t *v1100 = &v6[ostride * 14]; - int16x4_t v1108 = *(const int16x4_t *)v856; + int16x4_t v1108 = vld1_s16((const int16_t *)v856); float32x4_t v46 = vcvtq_n_f32_s32(vmovl_s16(v1108), 15); float32x4_t v185 = vcombine_f32(v183, v183); float32x4_t v193 = vcombine_f32(v191, v191); @@ -6697,19 +6697,19 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu15(const 
armral_cmplx_int16_t *restrict x, float32x4_t v282 = vcombine_f32(v280, v280); float32x4_t v290 = vcombine_f32(v288, v288); float32x4_t v298 = vcombine_f32(v296, v296); - int16x4_t v1104 = *(const int16x4_t *)v837; - int16x4_t v1106 = *(const int16x4_t *)v846; - int16x4_t v1110 = *(const int16x4_t *)v865; - int16x4_t v1112 = *(const int16x4_t *)v874; - int16x4_t v1114 = *(const int16x4_t *)v883; - int16x4_t v1116 = *(const int16x4_t *)v892; - int16x4_t v1120 = *(const int16x4_t *)v910; - int16x4_t v1122 = *(const int16x4_t *)v919; - int16x4_t v1124 = *(const int16x4_t *)v928; - int16x4_t v1126 = *(const int16x4_t *)v937; - int16x4_t v1128 = *(const int16x4_t *)v946; - int16x4_t v1130 = *(const int16x4_t *)v955; - int16x4_t v1132 = *(const int16x4_t *)v964; + int16x4_t v1104 = vld1_s16((const int16_t *)v837); + int16x4_t v1106 = vld1_s16((const int16_t *)v846); + int16x4_t v1110 = vld1_s16((const int16_t *)v865); + int16x4_t v1112 = vld1_s16((const int16_t *)v874); + int16x4_t v1114 = vld1_s16((const int16_t *)v883); + int16x4_t v1116 = vld1_s16((const int16_t *)v892); + int16x4_t v1120 = vld1_s16((const int16_t *)v910); + int16x4_t v1122 = vld1_s16((const int16_t *)v919); + int16x4_t v1124 = vld1_s16((const int16_t *)v928); + int16x4_t v1126 = vld1_s16((const int16_t *)v937); + int16x4_t v1128 = vld1_s16((const int16_t *)v946); + int16x4_t v1130 = vld1_s16((const int16_t *)v955); + int16x4_t v1132 = vld1_s16((const int16_t *)v964); float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v1104), 15); float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v1106), 15); float32x4_t v55 = vcvtq_n_f32_s32(vmovl_s16(v1110), 15); @@ -6805,7 +6805,7 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu15(const armral_cmplx_int16_t *restrict x, float32x4_t v315 = vaddq_f32(v283, v291); float32x4_t v325 = vaddq_f32(v324, v283); float32x4_t v326 = vsubq_f32(v324, v283); - *(int16x4_t *)v974 = v329; + vst1_s16((int16_t *)v974, v329); float32x4_t v208 = vaddq_f32(v204, v206); float32x4_t v209 = vsubq_f32(v204, 
v206); float32x4_t v210 = vaddq_f32(v205, v207); @@ -6830,8 +6830,8 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu15(const armral_cmplx_int16_t *restrict x, int16x4_t v410 = vqmovn_s32(vcvtq_n_s32_f32(v210, 15)); float32x4_t v432 = vaddq_f32(v208, v264); int16x4_t v437 = vqmovn_s32(vcvtq_n_s32_f32(v208, 15)); - *(int16x4_t *)v983 = v337; - *(int16x4_t *)v992 = v345; + vst1_s16((int16_t *)v983, v337); + vst1_s16((int16_t *)v992, v345); float32x4_t v352 = vaddq_f32(v351, v321); float32x4_t v353 = vsubq_f32(v351, v321); float32x4_t v379 = vaddq_f32(v378, v323); @@ -6840,10 +6840,10 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu15(const armral_cmplx_int16_t *restrict x, float32x4_t v407 = vsubq_f32(v405, v322); float32x4_t v433 = vaddq_f32(v432, v320); float32x4_t v434 = vsubq_f32(v432, v320); - *(int16x4_t *)v1001 = v356; - *(int16x4_t *)v1028 = v383; - *(int16x4_t *)v1055 = v410; - *(int16x4_t *)v1082 = v437; + vst1_s16((int16_t *)v1001, v356); + vst1_s16((int16_t *)v1028, v383); + vst1_s16((int16_t *)v1055, v410); + vst1_s16((int16_t *)v1082, v437); int16x4_t v364 = vqmovn_s32(vcvtq_n_s32_f32(v353, 15)); int16x4_t v372 = vqmovn_s32(vcvtq_n_s32_f32(v352, 15)); int16x4_t v391 = vqmovn_s32(vcvtq_n_s32_f32(v380, 15)); @@ -6852,14 +6852,14 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu15(const armral_cmplx_int16_t *restrict x, int16x4_t v426 = vqmovn_s32(vcvtq_n_s32_f32(v406, 15)); int16x4_t v445 = vqmovn_s32(vcvtq_n_s32_f32(v434, 15)); int16x4_t v453 = vqmovn_s32(vcvtq_n_s32_f32(v433, 15)); - *(int16x4_t *)v1010 = v364; - *(int16x4_t *)v1019 = v372; - *(int16x4_t *)v1037 = v391; - *(int16x4_t *)v1046 = v399; - *(int16x4_t *)v1064 = v418; - *(int16x4_t *)v1073 = v426; - *(int16x4_t *)v1091 = v445; - *(int16x4_t *)v1100 = v453; + vst1_s16((int16_t *)v1010, v364); + vst1_s16((int16_t *)v1019, v372); + vst1_s16((int16_t *)v1037, v391); + vst1_s16((int16_t *)v1046, v399); + vst1_s16((int16_t *)v1064, v418); + vst1_s16((int16_t *)v1073, v426); + vst1_s16((int16_t *)v1091, v445); + 
vst1_s16((int16_t *)v1100, v453); v5 += 2 * 1; v6 += 2 * 1; } @@ -7601,7 +7601,7 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu16(const armral_cmplx_int16_t *restrict x, float32x2_t v300 = (float32x2_t){v299, v299}; const int32_t *v866 = &v5[0]; int32_t *v1011 = &v6[0]; - int16x4_t v1166 = *(const int16x4_t *)v938; + int16x4_t v1166 = vld1_s16((const int16_t *)v938); float32x4_t v100 = vcvtq_n_f32_s32(vmovl_s16(v1166), 15); float32x2_t v247 = vmul_f32(v283, v245); float32x2_t v255 = vmul_f32(v283, v253); @@ -7640,27 +7640,27 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu16(const armral_cmplx_int16_t *restrict x, int32_t *v1128 = &v6[ostride * 13]; int32_t *v1137 = &v6[ostride * 14]; int32_t *v1146 = &v6[ostride * 15]; - int16x4_t v1150 = *(const int16x4_t *)v866; + int16x4_t v1150 = vld1_s16((const int16_t *)v866); float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v1150), 15); float32x4_t v249 = vcombine_f32(v247, v247); float32x4_t v257 = vcombine_f32(v255, v255); float32x4_t v270 = vcombine_f32(v268, v268); float32x4_t v278 = vcombine_f32(v276, v276); float32x4_t v286 = vcombine_f32(v284, v284); - int16x4_t v1152 = *(const int16x4_t *)v875; - int16x4_t v1154 = *(const int16x4_t *)v884; - int16x4_t v1156 = *(const int16x4_t *)v893; - int16x4_t v1158 = *(const int16x4_t *)v902; - int16x4_t v1160 = *(const int16x4_t *)v911; - int16x4_t v1162 = *(const int16x4_t *)v920; - int16x4_t v1164 = *(const int16x4_t *)v929; - int16x4_t v1168 = *(const int16x4_t *)v947; - int16x4_t v1170 = *(const int16x4_t *)v956; - int16x4_t v1172 = *(const int16x4_t *)v965; - int16x4_t v1174 = *(const int16x4_t *)v974; - int16x4_t v1176 = *(const int16x4_t *)v983; - int16x4_t v1178 = *(const int16x4_t *)v992; - int16x4_t v1180 = *(const int16x4_t *)v1001; + int16x4_t v1152 = vld1_s16((const int16_t *)v875); + int16x4_t v1154 = vld1_s16((const int16_t *)v884); + int16x4_t v1156 = vld1_s16((const int16_t *)v893); + int16x4_t v1158 = vld1_s16((const int16_t *)v902); + int16x4_t v1160 = vld1_s16((const int16_t 
*)v911); + int16x4_t v1162 = vld1_s16((const int16_t *)v920); + int16x4_t v1164 = vld1_s16((const int16_t *)v929); + int16x4_t v1168 = vld1_s16((const int16_t *)v947); + int16x4_t v1170 = vld1_s16((const int16_t *)v956); + int16x4_t v1172 = vld1_s16((const int16_t *)v965); + int16x4_t v1174 = vld1_s16((const int16_t *)v974); + int16x4_t v1176 = vld1_s16((const int16_t *)v983); + int16x4_t v1178 = vld1_s16((const int16_t *)v992); + int16x4_t v1180 = vld1_s16((const int16_t *)v1001); float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v1152), 15); float32x4_t v46 = vcvtq_n_f32_s32(vmovl_s16(v1154), 15); float32x4_t v54 = vcvtq_n_f32_s32(vmovl_s16(v1156), 15); @@ -7762,8 +7762,8 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu16(const armral_cmplx_int16_t *restrict x, float32x4_t v328 = vsubq_f32(v314, v316); float32x4_t v329 = vaddq_f32(v314, v322); float32x4_t v330 = vsubq_f32(v314, v322); - *(int16x4_t *)v1011 = v349; - *(int16x4_t *)v1083 = v413; + vst1_s16((int16_t *)v1011, v349); + vst1_s16((int16_t *)v1083, v413); float32x4_t v309 = vaddq_f32(v305, v306); float32x4_t v310 = vaddq_f32(v307, v308); float32x4_t v311 = vsubq_f32(v307, v308); @@ -7788,8 +7788,8 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu16(const armral_cmplx_int16_t *restrict x, int16x4_t v397 = vqmovn_s32(vcvtq_n_s32_f32(v311, 15)); int16x4_t v429 = vqmovn_s32(vcvtq_n_s32_f32(v310, 15)); int16x4_t v461 = vqmovn_s32(vcvtq_n_s32_f32(v309, 15)); - *(int16x4_t *)v1047 = v381; - *(int16x4_t *)v1119 = v445; + vst1_s16((int16_t *)v1047, v381); + vst1_s16((int16_t *)v1119, v445); int16x4_t v357 = vqmovn_s32(vcvtq_n_s32_f32(v342, 15)); int16x4_t v373 = vqmovn_s32(vcvtq_n_s32_f32(v345, 15)); int16x4_t v389 = vqmovn_s32(vcvtq_n_s32_f32(v346, 15)); @@ -7798,18 +7798,18 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu16(const armral_cmplx_int16_t *restrict x, int16x4_t v437 = vqmovn_s32(vcvtq_n_s32_f32(v343, 15)); int16x4_t v453 = vqmovn_s32(vcvtq_n_s32_f32(v344, 15)); int16x4_t v469 = vqmovn_s32(vcvtq_n_s32_f32(v339, 15)); - 
*(int16x4_t *)v1029 = v365; - *(int16x4_t *)v1065 = v397; - *(int16x4_t *)v1101 = v429; - *(int16x4_t *)v1137 = v461; - *(int16x4_t *)v1020 = v357; - *(int16x4_t *)v1038 = v373; - *(int16x4_t *)v1056 = v389; - *(int16x4_t *)v1074 = v405; - *(int16x4_t *)v1092 = v421; - *(int16x4_t *)v1110 = v437; - *(int16x4_t *)v1128 = v453; - *(int16x4_t *)v1146 = v469; + vst1_s16((int16_t *)v1029, v365); + vst1_s16((int16_t *)v1065, v397); + vst1_s16((int16_t *)v1101, v429); + vst1_s16((int16_t *)v1137, v461); + vst1_s16((int16_t *)v1020, v357); + vst1_s16((int16_t *)v1038, v373); + vst1_s16((int16_t *)v1056, v389); + vst1_s16((int16_t *)v1074, v405); + vst1_s16((int16_t *)v1092, v421); + vst1_s16((int16_t *)v1110, v437); + vst1_s16((int16_t *)v1128, v453); + vst1_s16((int16_t *)v1146, v469); v5 += 2 * 1; v6 += 2 * 1; } @@ -8599,7 +8599,7 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu17(const armral_cmplx_int16_t *restrict x, float32x2_t v461 = (float32x2_t){v459, v460}; const int32_t *v1424 = &v5[0]; int32_t *v1434 = &v6[0]; - int16x4_t v1582 = *(const int16x4_t *)v1279; + int16x4_t v1582 = vld1_s16((const int16_t *)v1279); float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v1582), 15); float32x4_t v232 = vcombine_f32(v231, v231); float32x4_t v237 = vcombine_f32(v236, v236); @@ -8666,7 +8666,7 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu17(const armral_cmplx_int16_t *restrict x, int32_t *v1560 = &v6[ostride * 10]; int32_t *v1569 = &v6[ostride * 8]; int32_t *v1578 = &v6[ostride * 9]; - int16x4_t v1614 = *(const int16x4_t *)v1424; + int16x4_t v1614 = vld1_s16((const int16_t *)v1424); float32x4_t v222 = vcvtq_n_f32_s32(vmovl_s16(v1614), 15); float32x4_t v305 = vcombine_f32(v303, v303); float32x4_t v313 = vcombine_f32(v311, v311); @@ -8689,21 +8689,21 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu17(const armral_cmplx_int16_t *restrict x, float32x4_t v449 = vcombine_f32(v447, v447); float32x4_t v457 = vcombine_f32(v455, v455); float32x4_t v465 = vcombine_f32(v463, v463); - int16x4_t v1584 = *(const 
int16x4_t *)v1288; - int16x4_t v1586 = *(const int16x4_t *)v1297; - int16x4_t v1588 = *(const int16x4_t *)v1306; - int16x4_t v1590 = *(const int16x4_t *)v1315; - int16x4_t v1592 = *(const int16x4_t *)v1324; - int16x4_t v1594 = *(const int16x4_t *)v1333; - int16x4_t v1596 = *(const int16x4_t *)v1342; - int16x4_t v1598 = *(const int16x4_t *)v1351; - int16x4_t v1600 = *(const int16x4_t *)v1360; - int16x4_t v1602 = *(const int16x4_t *)v1369; - int16x4_t v1604 = *(const int16x4_t *)v1378; - int16x4_t v1606 = *(const int16x4_t *)v1387; - int16x4_t v1608 = *(const int16x4_t *)v1396; - int16x4_t v1610 = *(const int16x4_t *)v1405; - int16x4_t v1612 = *(const int16x4_t *)v1414; + int16x4_t v1584 = vld1_s16((const int16_t *)v1288); + int16x4_t v1586 = vld1_s16((const int16_t *)v1297); + int16x4_t v1588 = vld1_s16((const int16_t *)v1306); + int16x4_t v1590 = vld1_s16((const int16_t *)v1315); + int16x4_t v1592 = vld1_s16((const int16_t *)v1324); + int16x4_t v1594 = vld1_s16((const int16_t *)v1333); + int16x4_t v1596 = vld1_s16((const int16_t *)v1342); + int16x4_t v1598 = vld1_s16((const int16_t *)v1351); + int16x4_t v1600 = vld1_s16((const int16_t *)v1360); + int16x4_t v1602 = vld1_s16((const int16_t *)v1369); + int16x4_t v1604 = vld1_s16((const int16_t *)v1378); + int16x4_t v1606 = vld1_s16((const int16_t *)v1387); + int16x4_t v1608 = vld1_s16((const int16_t *)v1396); + int16x4_t v1610 = vld1_s16((const int16_t *)v1405); + int16x4_t v1612 = vld1_s16((const int16_t *)v1414); float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v1584), 15); float32x4_t v46 = vcvtq_n_f32_s32(vmovl_s16(v1586), 15); float32x4_t v54 = vcvtq_n_f32_s32(vmovl_s16(v1588), 15); @@ -8860,7 +8860,7 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu17(const armral_cmplx_int16_t *restrict x, float32x4_t v499 = vaddq_f32(v362, v370); float32x4_t v500 = vaddq_f32(v378, v394); float32x4_t v501 = vaddq_f32(v386, v394); - *(int16x4_t *)v1434 = v543; + vst1_s16((int16_t *)v1434, v543); float32x4_t v214 = vaddq_f32(v209, v213); 
float32x4_t v450 = vmulq_f32(v448, v449); float32x4_t v456 = vrev64q_f32(v213); @@ -8924,8 +8924,8 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu17(const armral_cmplx_int16_t *restrict x, float32x4_t v576 = vsubq_f32(v488, v527); float32x4_t v675 = vaddq_f32(v492, v538); float32x4_t v684 = vsubq_f32(v492, v538); - *(int16x4_t *)v1479 = v588; - *(int16x4_t *)v1488 = v597; + vst1_s16((int16_t *)v1479, v588); + vst1_s16((int16_t *)v1488, v597); float32x4_t v532 = vaddq_f32(v531, v513); float32x4_t v535 = vaddq_f32(v534, v520); float32x4_t v549 = vaddq_f32(v486, v523); @@ -8948,24 +8948,24 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu17(const armral_cmplx_int16_t *restrict x, int16x4_t v651 = vqmovn_s32(vcvtq_n_s32_f32(v648, 15)); float32x4_t v657 = vaddq_f32(v491, v535); float32x4_t v666 = vsubq_f32(v491, v535); - *(int16x4_t *)v1461 = v570; - *(int16x4_t *)v1470 = v579; - *(int16x4_t *)v1569 = v678; - *(int16x4_t *)v1578 = v687; + vst1_s16((int16_t *)v1461, v570); + vst1_s16((int16_t *)v1470, v579); + vst1_s16((int16_t *)v1569, v678); + vst1_s16((int16_t *)v1578, v687); int16x4_t v606 = vqmovn_s32(vcvtq_n_s32_f32(v603, 15)); int16x4_t v615 = vqmovn_s32(vcvtq_n_s32_f32(v612, 15)); int16x4_t v660 = vqmovn_s32(vcvtq_n_s32_f32(v657, 15)); int16x4_t v669 = vqmovn_s32(vcvtq_n_s32_f32(v666, 15)); - *(int16x4_t *)v1443 = v552; - *(int16x4_t *)v1452 = v561; - *(int16x4_t *)v1515 = v624; - *(int16x4_t *)v1524 = v633; - *(int16x4_t *)v1533 = v642; - *(int16x4_t *)v1542 = v651; - *(int16x4_t *)v1497 = v606; - *(int16x4_t *)v1506 = v615; - *(int16x4_t *)v1551 = v660; - *(int16x4_t *)v1560 = v669; + vst1_s16((int16_t *)v1443, v552); + vst1_s16((int16_t *)v1452, v561); + vst1_s16((int16_t *)v1515, v624); + vst1_s16((int16_t *)v1524, v633); + vst1_s16((int16_t *)v1533, v642); + vst1_s16((int16_t *)v1542, v651); + vst1_s16((int16_t *)v1497, v606); + vst1_s16((int16_t *)v1506, v615); + vst1_s16((int16_t *)v1551, v660); + vst1_s16((int16_t *)v1560, v669); v5 += 2 * 1; v6 += 2 * 1; } @@ -10111,7 
+10111,7 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu18(const armral_cmplx_int16_t *restrict x, float32x2_t v382 = (float32x2_t){v380, v381}; const int32_t *v1018 = &v5[0]; int32_t *v1181 = &v6[0]; - int16x4_t v1360 = *(const int16x4_t *)v1117; + int16x4_t v1360 = vld1_s16((const int16_t *)v1117); float32x4_t v126 = vcvtq_n_f32_s32(vmovl_s16(v1360), 15); float32x4_t v326 = vcombine_f32(v325, v325); float32x4_t v339 = vcombine_f32(v338, v338); @@ -10154,28 +10154,28 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu18(const armral_cmplx_int16_t *restrict x, int32_t *v1316 = &v6[ostride * 7]; int32_t *v1325 = &v6[ostride * 8]; int32_t *v1334 = &v6[ostride * 17]; - int16x4_t v1338 = *(const int16x4_t *)v1018; + int16x4_t v1338 = vld1_s16((const int16_t *)v1018); float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v1338), 15); float32x4_t v347 = vcombine_f32(v345, v345); float32x4_t v370 = vcombine_f32(v368, v368); float32x4_t v378 = vcombine_f32(v376, v376); float32x4_t v386 = vcombine_f32(v384, v384); - int16x4_t v1340 = *(const int16x4_t *)v1027; - int16x4_t v1342 = *(const int16x4_t *)v1036; - int16x4_t v1344 = *(const int16x4_t *)v1045; - int16x4_t v1346 = *(const int16x4_t *)v1054; - int16x4_t v1348 = *(const int16x4_t *)v1063; - int16x4_t v1350 = *(const int16x4_t *)v1072; - int16x4_t v1352 = *(const int16x4_t *)v1081; - int16x4_t v1354 = *(const int16x4_t *)v1090; - int16x4_t v1356 = *(const int16x4_t *)v1099; - int16x4_t v1358 = *(const int16x4_t *)v1108; - int16x4_t v1362 = *(const int16x4_t *)v1126; - int16x4_t v1364 = *(const int16x4_t *)v1135; - int16x4_t v1366 = *(const int16x4_t *)v1144; - int16x4_t v1368 = *(const int16x4_t *)v1153; - int16x4_t v1370 = *(const int16x4_t *)v1162; - int16x4_t v1372 = *(const int16x4_t *)v1171; + int16x4_t v1340 = vld1_s16((const int16_t *)v1027); + int16x4_t v1342 = vld1_s16((const int16_t *)v1036); + int16x4_t v1344 = vld1_s16((const int16_t *)v1045); + int16x4_t v1346 = vld1_s16((const int16_t *)v1054); + int16x4_t v1348 = vld1_s16((const 
int16_t *)v1063); + int16x4_t v1350 = vld1_s16((const int16_t *)v1072); + int16x4_t v1352 = vld1_s16((const int16_t *)v1081); + int16x4_t v1354 = vld1_s16((const int16_t *)v1090); + int16x4_t v1356 = vld1_s16((const int16_t *)v1099); + int16x4_t v1358 = vld1_s16((const int16_t *)v1108); + int16x4_t v1362 = vld1_s16((const int16_t *)v1126); + int16x4_t v1364 = vld1_s16((const int16_t *)v1135); + int16x4_t v1366 = vld1_s16((const int16_t *)v1144); + int16x4_t v1368 = vld1_s16((const int16_t *)v1153); + int16x4_t v1370 = vld1_s16((const int16_t *)v1162); + int16x4_t v1372 = vld1_s16((const int16_t *)v1171); float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v1340), 15); float32x4_t v46 = vcvtq_n_f32_s32(vmovl_s16(v1342), 15); float32x4_t v54 = vcvtq_n_f32_s32(vmovl_s16(v1344), 15); @@ -10304,8 +10304,8 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu18(const armral_cmplx_int16_t *restrict x, float32x4_t v279 = vaddq_f32(v278, v273); float32x4_t v390 = vaddq_f32(v309, v389); float32x4_t v394 = vaddq_f32(v393, v388); - *(int16x4_t *)v1181 = v415; - *(int16x4_t *)v1190 = v423; + vst1_s16((int16_t *)v1181, v415); + vst1_s16((int16_t *)v1190, v423); float32x4_t v276 = vaddq_f32(v275, v220); float32x4_t v277 = vsubq_f32(v275, v220); float32x4_t v280 = vaddq_f32(v279, v238); @@ -10338,10 +10338,10 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu18(const armral_cmplx_int16_t *restrict x, float32x4_t v410 = vsubq_f32(v398, v404); float32x4_t v411 = vaddq_f32(v400, v406); float32x4_t v412 = vsubq_f32(v400, v406); - *(int16x4_t *)v1235 = v463; - *(int16x4_t *)v1244 = v471; - *(int16x4_t *)v1289 = v511; - *(int16x4_t *)v1298 = v519; + vst1_s16((int16_t *)v1235, v463); + vst1_s16((int16_t *)v1244, v471); + vst1_s16((int16_t *)v1289, v511); + vst1_s16((int16_t *)v1298, v519); int16x4_t v431 = vqmovn_s32(vcvtq_n_s32_f32(v293, 15)); int16x4_t v439 = vqmovn_s32(vcvtq_n_s32_f32(v408, 15)); int16x4_t v447 = vqmovn_s32(vcvtq_n_s32_f32(v294, 15)); @@ -10354,18 +10354,18 @@ void 
armral_fft_cs16_cf32_cs16_ac_n_uu18(const armral_cmplx_int16_t *restrict x, int16x4_t v535 = vqmovn_s32(vcvtq_n_s32_f32(v410, 15)); int16x4_t v543 = vqmovn_s32(vcvtq_n_s32_f32(v292, 15)); int16x4_t v551 = vqmovn_s32(vcvtq_n_s32_f32(v407, 15)); - *(int16x4_t *)v1199 = v431; - *(int16x4_t *)v1208 = v439; - *(int16x4_t *)v1217 = v447; - *(int16x4_t *)v1226 = v455; - *(int16x4_t *)v1253 = v479; - *(int16x4_t *)v1262 = v487; - *(int16x4_t *)v1271 = v495; - *(int16x4_t *)v1280 = v503; - *(int16x4_t *)v1307 = v527; - *(int16x4_t *)v1316 = v535; - *(int16x4_t *)v1325 = v543; - *(int16x4_t *)v1334 = v551; + vst1_s16((int16_t *)v1199, v431); + vst1_s16((int16_t *)v1208, v439); + vst1_s16((int16_t *)v1217, v447); + vst1_s16((int16_t *)v1226, v455); + vst1_s16((int16_t *)v1253, v479); + vst1_s16((int16_t *)v1262, v487); + vst1_s16((int16_t *)v1271, v495); + vst1_s16((int16_t *)v1280, v503); + vst1_s16((int16_t *)v1307, v527); + vst1_s16((int16_t *)v1316, v535); + vst1_s16((int16_t *)v1325, v543); + vst1_s16((int16_t *)v1334, v551); v5 += 2 * 1; v6 += 2 * 1; } @@ -11286,7 +11286,7 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu19(const armral_cmplx_int16_t *restrict x, float32x2_t v502 = (float32x2_t){v500, v501}; const int32_t *v1571 = &v5[0]; int32_t *v1581 = &v6[0]; - int16x4_t v1747 = *(const int16x4_t *)v1408; + int16x4_t v1747 = vld1_s16((const int16_t *)v1408); float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v1747), 15); float32x4_t v264 = vcombine_f32(v263, v263); float32x4_t v269 = vcombine_f32(v268, v268); @@ -11360,7 +11360,7 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu19(const armral_cmplx_int16_t *restrict x, int32_t *v1725 = &v6[ostride * 11]; int32_t *v1734 = &v6[ostride * 9]; int32_t *v1743 = &v6[ostride * 10]; - int16x4_t v1783 = *(const int16x4_t *)v1571; + int16x4_t v1783 = vld1_s16((const int16_t *)v1571); float32x4_t v206 = vcvtq_n_f32_s32(vmovl_s16(v1783), 15); float32x4_t v362 = vcombine_f32(v360, v360); float32x4_t v370 = vcombine_f32(v368, v368); @@ -11381,23 
+11381,23 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu19(const armral_cmplx_int16_t *restrict x, float32x4_t v490 = vcombine_f32(v488, v488); float32x4_t v498 = vcombine_f32(v496, v496); float32x4_t v506 = vcombine_f32(v504, v504); - int16x4_t v1749 = *(const int16x4_t *)v1417; - int16x4_t v1751 = *(const int16x4_t *)v1426; - int16x4_t v1753 = *(const int16x4_t *)v1435; - int16x4_t v1755 = *(const int16x4_t *)v1444; - int16x4_t v1757 = *(const int16x4_t *)v1453; - int16x4_t v1759 = *(const int16x4_t *)v1462; - int16x4_t v1761 = *(const int16x4_t *)v1471; - int16x4_t v1763 = *(const int16x4_t *)v1480; - int16x4_t v1765 = *(const int16x4_t *)v1489; - int16x4_t v1767 = *(const int16x4_t *)v1498; - int16x4_t v1769 = *(const int16x4_t *)v1507; - int16x4_t v1771 = *(const int16x4_t *)v1516; - int16x4_t v1773 = *(const int16x4_t *)v1525; - int16x4_t v1775 = *(const int16x4_t *)v1534; - int16x4_t v1777 = *(const int16x4_t *)v1543; - int16x4_t v1779 = *(const int16x4_t *)v1552; - int16x4_t v1781 = *(const int16x4_t *)v1561; + int16x4_t v1749 = vld1_s16((const int16_t *)v1417); + int16x4_t v1751 = vld1_s16((const int16_t *)v1426); + int16x4_t v1753 = vld1_s16((const int16_t *)v1435); + int16x4_t v1755 = vld1_s16((const int16_t *)v1444); + int16x4_t v1757 = vld1_s16((const int16_t *)v1453); + int16x4_t v1759 = vld1_s16((const int16_t *)v1462); + int16x4_t v1761 = vld1_s16((const int16_t *)v1471); + int16x4_t v1763 = vld1_s16((const int16_t *)v1480); + int16x4_t v1765 = vld1_s16((const int16_t *)v1489); + int16x4_t v1767 = vld1_s16((const int16_t *)v1498); + int16x4_t v1769 = vld1_s16((const int16_t *)v1507); + int16x4_t v1771 = vld1_s16((const int16_t *)v1516); + int16x4_t v1773 = vld1_s16((const int16_t *)v1525); + int16x4_t v1775 = vld1_s16((const int16_t *)v1534); + int16x4_t v1777 = vld1_s16((const int16_t *)v1543); + int16x4_t v1779 = vld1_s16((const int16_t *)v1552); + int16x4_t v1781 = vld1_s16((const int16_t *)v1561); float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v1749), 
15); float32x4_t v46 = vcvtq_n_f32_s32(vmovl_s16(v1751), 15); float32x4_t v54 = vcvtq_n_f32_s32(vmovl_s16(v1753), 15); @@ -11580,7 +11580,7 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu19(const armral_cmplx_int16_t *restrict x, float32x4_t v561 = vsubq_f32(v539, v540); float32x4_t v563 = vsubq_f32(v491, v507); float32x4_t v564 = vsubq_f32(v499, v507); - *(int16x4_t *)v1581 = v595; + vst1_s16((int16_t *)v1581, v595); float32x4_t v517 = vsubq_f32(v315, v514); float32x4_t v518 = vaddq_f32(v305, v510); float32x4_t v520 = vaddq_f32(v516, v320); @@ -11664,10 +11664,10 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu19(const armral_cmplx_int16_t *restrict x, float32x4_t v718 = vsubq_f32(v575, v587); int16x4_t v730 = vqmovn_s32(vcvtq_n_s32_f32(v727, 15)); int16x4_t v739 = vqmovn_s32(vcvtq_n_s32_f32(v736, 15)); - *(int16x4_t *)v1608 = v622; - *(int16x4_t *)v1617 = v631; - *(int16x4_t *)v1626 = v640; - *(int16x4_t *)v1635 = v649; + vst1_s16((int16_t *)v1608, v622); + vst1_s16((int16_t *)v1617, v631); + vst1_s16((int16_t *)v1626, v640); + vst1_s16((int16_t *)v1635, v649); float32x4_t v601 = vaddq_f32(v570, v582); float32x4_t v610 = vsubq_f32(v570, v582); int16x4_t v694 = vqmovn_s32(vcvtq_n_s32_f32(v691, 15)); @@ -11676,24 +11676,24 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu19(const armral_cmplx_int16_t *restrict x, int16x4_t v721 = vqmovn_s32(vcvtq_n_s32_f32(v718, 15)); float32x4_t v745 = vaddq_f32(v572, v584); float32x4_t v754 = vsubq_f32(v572, v584); - *(int16x4_t *)v1644 = v658; - *(int16x4_t *)v1653 = v667; - *(int16x4_t *)v1662 = v676; - *(int16x4_t *)v1671 = v685; - *(int16x4_t *)v1716 = v730; - *(int16x4_t *)v1725 = v739; + vst1_s16((int16_t *)v1644, v658); + vst1_s16((int16_t *)v1653, v667); + vst1_s16((int16_t *)v1662, v676); + vst1_s16((int16_t *)v1671, v685); + vst1_s16((int16_t *)v1716, v730); + vst1_s16((int16_t *)v1725, v739); int16x4_t v604 = vqmovn_s32(vcvtq_n_s32_f32(v601, 15)); int16x4_t v613 = vqmovn_s32(vcvtq_n_s32_f32(v610, 15)); int16x4_t v748 = 
vqmovn_s32(vcvtq_n_s32_f32(v745, 15)); int16x4_t v757 = vqmovn_s32(vcvtq_n_s32_f32(v754, 15)); - *(int16x4_t *)v1680 = v694; - *(int16x4_t *)v1689 = v703; - *(int16x4_t *)v1698 = v712; - *(int16x4_t *)v1707 = v721; - *(int16x4_t *)v1590 = v604; - *(int16x4_t *)v1599 = v613; - *(int16x4_t *)v1734 = v748; - *(int16x4_t *)v1743 = v757; + vst1_s16((int16_t *)v1680, v694); + vst1_s16((int16_t *)v1689, v703); + vst1_s16((int16_t *)v1698, v712); + vst1_s16((int16_t *)v1707, v721); + vst1_s16((int16_t *)v1590, v604); + vst1_s16((int16_t *)v1599, v613); + vst1_s16((int16_t *)v1734, v748); + vst1_s16((int16_t *)v1743, v757); v5 += 2 * 1; v6 += 2 * 1; } @@ -12978,7 +12978,7 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu20(const armral_cmplx_int16_t *restrict x, float32x2_t v423 = (float32x2_t){v422, v422}; const int32_t *v1104 = &v5[0]; int32_t *v1285 = &v6[0]; - int16x4_t v1496 = *(const int16x4_t *)v1266; + int16x4_t v1496 = vld1_s16((const int16_t *)v1266); float32x4_t v198 = vcvtq_n_f32_s32(vmovl_s16(v1496), 15); float32x4_t v339 = vcombine_f32(v338, v338); float32x4_t v344 = vcombine_f32(v343, v343); @@ -13027,7 +13027,7 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu20(const armral_cmplx_int16_t *restrict x, int32_t *v1438 = &v6[ostride * 9]; int32_t *v1447 = &v6[ostride * 14]; int32_t *v1456 = &v6[ostride * 19]; - int16x4_t v1460 = *(const int16x4_t *)v1104; + int16x4_t v1460 = vld1_s16((const int16_t *)v1104); float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v1460), 15); float32x4_t v352 = vcombine_f32(v350, v350); float32x4_t v360 = vcombine_f32(v358, v358); @@ -13035,24 +13035,24 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu20(const armral_cmplx_int16_t *restrict x, float32x4_t v393 = vcombine_f32(v391, v391); float32x4_t v401 = vcombine_f32(v399, v399); float32x4_t v409 = vcombine_f32(v407, v407); - int16x4_t v1462 = *(const int16x4_t *)v1113; - int16x4_t v1464 = *(const int16x4_t *)v1122; - int16x4_t v1466 = *(const int16x4_t *)v1131; - int16x4_t v1468 = *(const int16x4_t *)v1140; - 
int16x4_t v1470 = *(const int16x4_t *)v1149; - int16x4_t v1472 = *(const int16x4_t *)v1158; - int16x4_t v1474 = *(const int16x4_t *)v1167; - int16x4_t v1476 = *(const int16x4_t *)v1176; - int16x4_t v1478 = *(const int16x4_t *)v1185; - int16x4_t v1480 = *(const int16x4_t *)v1194; - int16x4_t v1482 = *(const int16x4_t *)v1203; - int16x4_t v1484 = *(const int16x4_t *)v1212; - int16x4_t v1486 = *(const int16x4_t *)v1221; - int16x4_t v1488 = *(const int16x4_t *)v1230; - int16x4_t v1490 = *(const int16x4_t *)v1239; - int16x4_t v1492 = *(const int16x4_t *)v1248; - int16x4_t v1494 = *(const int16x4_t *)v1257; - int16x4_t v1498 = *(const int16x4_t *)v1275; + int16x4_t v1462 = vld1_s16((const int16_t *)v1113); + int16x4_t v1464 = vld1_s16((const int16_t *)v1122); + int16x4_t v1466 = vld1_s16((const int16_t *)v1131); + int16x4_t v1468 = vld1_s16((const int16_t *)v1140); + int16x4_t v1470 = vld1_s16((const int16_t *)v1149); + int16x4_t v1472 = vld1_s16((const int16_t *)v1158); + int16x4_t v1474 = vld1_s16((const int16_t *)v1167); + int16x4_t v1476 = vld1_s16((const int16_t *)v1176); + int16x4_t v1478 = vld1_s16((const int16_t *)v1185); + int16x4_t v1480 = vld1_s16((const int16_t *)v1194); + int16x4_t v1482 = vld1_s16((const int16_t *)v1203); + int16x4_t v1484 = vld1_s16((const int16_t *)v1212); + int16x4_t v1486 = vld1_s16((const int16_t *)v1221); + int16x4_t v1488 = vld1_s16((const int16_t *)v1230); + int16x4_t v1490 = vld1_s16((const int16_t *)v1239); + int16x4_t v1492 = vld1_s16((const int16_t *)v1248); + int16x4_t v1494 = vld1_s16((const int16_t *)v1257); + int16x4_t v1498 = vld1_s16((const int16_t *)v1275); float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v1462), 15); float32x4_t v46 = vcvtq_n_f32_s32(vmovl_s16(v1464), 15); float32x4_t v54 = vcvtq_n_f32_s32(vmovl_s16(v1466), 15); @@ -13192,8 +13192,8 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu20(const armral_cmplx_int16_t *restrict x, float32x4_t v426 = vaddq_f32(v394, v402); float32x4_t v435 = vaddq_f32(v330, v394); float32x4_t 
v436 = vsubq_f32(v330, v394); - *(int16x4_t *)v1285 = v439; - *(int16x4_t *)v1303 = v455; + vst1_s16((int16_t *)v1285, v439); + vst1_s16((int16_t *)v1303, v455); float32x4_t v263 = vaddq_f32(v259, v261); float32x4_t v264 = vsubq_f32(v259, v261); float32x4_t v265 = vaddq_f32(v260, v262); @@ -13218,8 +13218,8 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu20(const armral_cmplx_int16_t *restrict x, int16x4_t v557 = vqmovn_s32(vcvtq_n_s32_f32(v321, 15)); int16x4_t v575 = vqmovn_s32(vcvtq_n_s32_f32(v263, 15)); int16x4_t v591 = vqmovn_s32(vcvtq_n_s32_f32(v319, 15)); - *(int16x4_t *)v1294 = v447; - *(int16x4_t *)v1312 = v463; + vst1_s16((int16_t *)v1294, v447); + vst1_s16((int16_t *)v1312, v463); float32x4_t v469 = vaddq_f32(v376, v432); float32x4_t v470 = vsubq_f32(v376, v432); float32x4_t v503 = vaddq_f32(v378, v434); @@ -13228,14 +13228,14 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu20(const armral_cmplx_int16_t *restrict x, float32x4_t v538 = vsubq_f32(v377, v433); float32x4_t v571 = vaddq_f32(v375, v431); float32x4_t v572 = vsubq_f32(v375, v431); - *(int16x4_t *)v1321 = v473; - *(int16x4_t *)v1339 = v489; - *(int16x4_t *)v1357 = v507; - *(int16x4_t *)v1375 = v523; - *(int16x4_t *)v1393 = v541; - *(int16x4_t *)v1411 = v557; - *(int16x4_t *)v1429 = v575; - *(int16x4_t *)v1447 = v591; + vst1_s16((int16_t *)v1321, v473); + vst1_s16((int16_t *)v1339, v489); + vst1_s16((int16_t *)v1357, v507); + vst1_s16((int16_t *)v1375, v523); + vst1_s16((int16_t *)v1393, v541); + vst1_s16((int16_t *)v1411, v557); + vst1_s16((int16_t *)v1429, v575); + vst1_s16((int16_t *)v1447, v591); int16x4_t v481 = vqmovn_s32(vcvtq_n_s32_f32(v470, 15)); int16x4_t v497 = vqmovn_s32(vcvtq_n_s32_f32(v469, 15)); int16x4_t v515 = vqmovn_s32(vcvtq_n_s32_f32(v504, 15)); @@ -13244,14 +13244,14 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu20(const armral_cmplx_int16_t *restrict x, int16x4_t v565 = vqmovn_s32(vcvtq_n_s32_f32(v537, 15)); int16x4_t v583 = vqmovn_s32(vcvtq_n_s32_f32(v572, 15)); int16x4_t v599 = 
vqmovn_s32(vcvtq_n_s32_f32(v571, 15)); - *(int16x4_t *)v1330 = v481; - *(int16x4_t *)v1348 = v497; - *(int16x4_t *)v1366 = v515; - *(int16x4_t *)v1384 = v531; - *(int16x4_t *)v1402 = v549; - *(int16x4_t *)v1420 = v565; - *(int16x4_t *)v1438 = v583; - *(int16x4_t *)v1456 = v599; + vst1_s16((int16_t *)v1330, v481); + vst1_s16((int16_t *)v1348, v497); + vst1_s16((int16_t *)v1366, v515); + vst1_s16((int16_t *)v1384, v531); + vst1_s16((int16_t *)v1402, v549); + vst1_s16((int16_t *)v1420, v565); + vst1_s16((int16_t *)v1438, v583); + vst1_s16((int16_t *)v1456, v599); v5 += 2 * 1; v6 += 2 * 1; } @@ -14186,7 +14186,7 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu21(const armral_cmplx_int16_t *restrict x, float32x2_t v470 = (float32x2_t){v469, v469}; const int32_t *v1267 = &v5[0]; int32_t *v1439 = &v6[0]; - int16x4_t v1653 = *(const int16x4_t *)v1384; + int16x4_t v1653 = vld1_s16((const int16_t *)v1384); float32x4_t v163 = vcvtq_n_f32_s32(vmovl_s16(v1653), 15); float32x4_t v235 = vcombine_f32(v234, v234); float32x4_t v240 = vcombine_f32(v239, v239); @@ -14252,7 +14252,7 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu21(const armral_cmplx_int16_t *restrict x, int32_t *v1601 = &v6[ostride * 6]; int32_t *v1610 = &v6[ostride * 13]; int32_t *v1619 = &v6[ostride * 20]; - int16x4_t v1627 = *(const int16x4_t *)v1267; + int16x4_t v1627 = vld1_s16((const int16_t *)v1267); float32x4_t v46 = vcvtq_n_f32_s32(vmovl_s16(v1627), 15); float32x4_t v258 = vcombine_f32(v256, v256); float32x4_t v266 = vcombine_f32(v264, v264); @@ -14267,25 +14267,25 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu21(const armral_cmplx_int16_t *restrict x, float32x4_t v435 = vcombine_f32(v433, v433); float32x4_t v443 = vcombine_f32(v441, v441); float32x4_t v451 = vcombine_f32(v449, v449); - int16x4_t v1623 = *(const int16x4_t *)v1248; - int16x4_t v1625 = *(const int16x4_t *)v1257; - int16x4_t v1629 = *(const int16x4_t *)v1276; - int16x4_t v1631 = *(const int16x4_t *)v1285; - int16x4_t v1633 = *(const int16x4_t *)v1294; - int16x4_t 
v1635 = *(const int16x4_t *)v1303; - int16x4_t v1637 = *(const int16x4_t *)v1312; - int16x4_t v1639 = *(const int16x4_t *)v1321; - int16x4_t v1641 = *(const int16x4_t *)v1330; - int16x4_t v1643 = *(const int16x4_t *)v1339; - int16x4_t v1645 = *(const int16x4_t *)v1348; - int16x4_t v1647 = *(const int16x4_t *)v1357; - int16x4_t v1649 = *(const int16x4_t *)v1366; - int16x4_t v1651 = *(const int16x4_t *)v1375; - int16x4_t v1655 = *(const int16x4_t *)v1393; - int16x4_t v1657 = *(const int16x4_t *)v1402; - int16x4_t v1659 = *(const int16x4_t *)v1411; - int16x4_t v1661 = *(const int16x4_t *)v1420; - int16x4_t v1663 = *(const int16x4_t *)v1429; + int16x4_t v1623 = vld1_s16((const int16_t *)v1248); + int16x4_t v1625 = vld1_s16((const int16_t *)v1257); + int16x4_t v1629 = vld1_s16((const int16_t *)v1276); + int16x4_t v1631 = vld1_s16((const int16_t *)v1285); + int16x4_t v1633 = vld1_s16((const int16_t *)v1294); + int16x4_t v1635 = vld1_s16((const int16_t *)v1303); + int16x4_t v1637 = vld1_s16((const int16_t *)v1312); + int16x4_t v1639 = vld1_s16((const int16_t *)v1321); + int16x4_t v1641 = vld1_s16((const int16_t *)v1330); + int16x4_t v1643 = vld1_s16((const int16_t *)v1339); + int16x4_t v1645 = vld1_s16((const int16_t *)v1348); + int16x4_t v1647 = vld1_s16((const int16_t *)v1357); + int16x4_t v1649 = vld1_s16((const int16_t *)v1366); + int16x4_t v1651 = vld1_s16((const int16_t *)v1375); + int16x4_t v1655 = vld1_s16((const int16_t *)v1393); + int16x4_t v1657 = vld1_s16((const int16_t *)v1402); + int16x4_t v1659 = vld1_s16((const int16_t *)v1411); + int16x4_t v1661 = vld1_s16((const int16_t *)v1420); + int16x4_t v1663 = vld1_s16((const int16_t *)v1429); float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v1623), 15); float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v1625), 15); float32x4_t v55 = vcvtq_n_f32_s32(vmovl_s16(v1629), 15); @@ -14444,7 +14444,7 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu21(const armral_cmplx_int16_t *restrict x, float32x4_t v473 = vaddq_f32(v420, v428); 
float32x4_t v493 = vaddq_f32(v492, v420); float32x4_t v494 = vsubq_f32(v492, v420); - *(int16x4_t *)v1439 = v497; + vst1_s16((int16_t *)v1439, v497); float32x4_t v286 = vaddq_f32(v285, v246); float32x4_t v288 = vsubq_f32(v287, v251); float32x4_t v290 = vaddq_f32(v289, v251); @@ -14474,8 +14474,8 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu21(const armral_cmplx_int16_t *restrict x, float32x4_t v475 = vaddq_f32(v474, v444); float32x4_t v477 = vsubq_f32(v476, v452); float32x4_t v479 = vaddq_f32(v478, v452); - *(int16x4_t *)v1448 = v505; - *(int16x4_t *)v1457 = v513; + vst1_s16((int16_t *)v1448, v505); + vst1_s16((int16_t *)v1457, v513); float32x4_t v486 = vaddq_f32(v475, v481); float32x4_t v487 = vsubq_f32(v475, v481); float32x4_t v488 = vaddq_f32(v477, v483); @@ -14506,12 +14506,12 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu21(const armral_cmplx_int16_t *restrict x, float32x4_t v629 = vsubq_f32(v627, v488); float32x4_t v655 = vaddq_f32(v654, v486); float32x4_t v656 = vsubq_f32(v654, v486); - *(int16x4_t *)v1466 = v524; - *(int16x4_t *)v1493 = v551; - *(int16x4_t *)v1520 = v578; - *(int16x4_t *)v1547 = v605; - *(int16x4_t *)v1574 = v632; - *(int16x4_t *)v1601 = v659; + vst1_s16((int16_t *)v1466, v524); + vst1_s16((int16_t *)v1493, v551); + vst1_s16((int16_t *)v1520, v578); + vst1_s16((int16_t *)v1547, v605); + vst1_s16((int16_t *)v1574, v632); + vst1_s16((int16_t *)v1601, v659); int16x4_t v532 = vqmovn_s32(vcvtq_n_s32_f32(v521, 15)); int16x4_t v540 = vqmovn_s32(vcvtq_n_s32_f32(v520, 15)); int16x4_t v559 = vqmovn_s32(vcvtq_n_s32_f32(v548, 15)); @@ -14524,18 +14524,18 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu21(const armral_cmplx_int16_t *restrict x, int16x4_t v648 = vqmovn_s32(vcvtq_n_s32_f32(v628, 15)); int16x4_t v667 = vqmovn_s32(vcvtq_n_s32_f32(v656, 15)); int16x4_t v675 = vqmovn_s32(vcvtq_n_s32_f32(v655, 15)); - *(int16x4_t *)v1475 = v532; - *(int16x4_t *)v1484 = v540; - *(int16x4_t *)v1502 = v559; - *(int16x4_t *)v1511 = v567; - *(int16x4_t *)v1529 = v586; - 
*(int16x4_t *)v1538 = v594; - *(int16x4_t *)v1556 = v613; - *(int16x4_t *)v1565 = v621; - *(int16x4_t *)v1583 = v640; - *(int16x4_t *)v1592 = v648; - *(int16x4_t *)v1610 = v667; - *(int16x4_t *)v1619 = v675; + vst1_s16((int16_t *)v1475, v532); + vst1_s16((int16_t *)v1484, v540); + vst1_s16((int16_t *)v1502, v559); + vst1_s16((int16_t *)v1511, v567); + vst1_s16((int16_t *)v1529, v586); + vst1_s16((int16_t *)v1538, v594); + vst1_s16((int16_t *)v1556, v613); + vst1_s16((int16_t *)v1565, v621); + vst1_s16((int16_t *)v1583, v640); + vst1_s16((int16_t *)v1592, v648); + vst1_s16((int16_t *)v1610, v667); + vst1_s16((int16_t *)v1619, v675); v5 += 2 * 1; v6 += 2 * 1; } @@ -15703,7 +15703,7 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu22(const armral_cmplx_int16_t *restrict x, float32x2_t v604 = (float32x2_t){v602, v603}; const int32_t *v1534 = &v5[0]; int32_t *v1733 = &v6[0]; - int16x4_t v1952 = *(const int16x4_t *)v1651; + int16x4_t v1952 = vld1_s16((const int16_t *)v1651); float32x4_t v144 = vcvtq_n_f32_s32(vmovl_s16(v1952), 15); float32x4_t v483 = vcombine_f32(v482, v482); float32x2_t v489 = vmul_f32(v605, v487); @@ -15765,7 +15765,7 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu22(const armral_cmplx_int16_t *restrict x, int32_t *v1904 = &v6[ostride * 9]; int32_t *v1913 = &v6[ostride * 10]; int32_t *v1922 = &v6[ostride * 21]; - int16x4_t v1926 = *(const int16x4_t *)v1534; + int16x4_t v1926 = vld1_s16((const int16_t *)v1534); float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v1926), 15); float32x4_t v491 = vcombine_f32(v489, v489); float32x4_t v544 = vcombine_f32(v542, v542); @@ -15777,26 +15777,26 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu22(const armral_cmplx_int16_t *restrict x, float32x4_t v592 = vcombine_f32(v590, v590); float32x4_t v600 = vcombine_f32(v598, v598); float32x4_t v608 = vcombine_f32(v606, v606); - int16x4_t v1928 = *(const int16x4_t *)v1543; - int16x4_t v1930 = *(const int16x4_t *)v1552; - int16x4_t v1932 = *(const int16x4_t *)v1561; - int16x4_t v1934 = *(const 
int16x4_t *)v1570; - int16x4_t v1936 = *(const int16x4_t *)v1579; - int16x4_t v1938 = *(const int16x4_t *)v1588; - int16x4_t v1940 = *(const int16x4_t *)v1597; - int16x4_t v1942 = *(const int16x4_t *)v1606; - int16x4_t v1944 = *(const int16x4_t *)v1615; - int16x4_t v1946 = *(const int16x4_t *)v1624; - int16x4_t v1948 = *(const int16x4_t *)v1633; - int16x4_t v1950 = *(const int16x4_t *)v1642; - int16x4_t v1954 = *(const int16x4_t *)v1660; - int16x4_t v1956 = *(const int16x4_t *)v1669; - int16x4_t v1958 = *(const int16x4_t *)v1678; - int16x4_t v1960 = *(const int16x4_t *)v1687; - int16x4_t v1962 = *(const int16x4_t *)v1696; - int16x4_t v1964 = *(const int16x4_t *)v1705; - int16x4_t v1966 = *(const int16x4_t *)v1714; - int16x4_t v1968 = *(const int16x4_t *)v1723; + int16x4_t v1928 = vld1_s16((const int16_t *)v1543); + int16x4_t v1930 = vld1_s16((const int16_t *)v1552); + int16x4_t v1932 = vld1_s16((const int16_t *)v1561); + int16x4_t v1934 = vld1_s16((const int16_t *)v1570); + int16x4_t v1936 = vld1_s16((const int16_t *)v1579); + int16x4_t v1938 = vld1_s16((const int16_t *)v1588); + int16x4_t v1940 = vld1_s16((const int16_t *)v1597); + int16x4_t v1942 = vld1_s16((const int16_t *)v1606); + int16x4_t v1944 = vld1_s16((const int16_t *)v1615); + int16x4_t v1946 = vld1_s16((const int16_t *)v1624); + int16x4_t v1948 = vld1_s16((const int16_t *)v1633); + int16x4_t v1950 = vld1_s16((const int16_t *)v1642); + int16x4_t v1954 = vld1_s16((const int16_t *)v1660); + int16x4_t v1956 = vld1_s16((const int16_t *)v1669); + int16x4_t v1958 = vld1_s16((const int16_t *)v1678); + int16x4_t v1960 = vld1_s16((const int16_t *)v1687); + int16x4_t v1962 = vld1_s16((const int16_t *)v1696); + int16x4_t v1964 = vld1_s16((const int16_t *)v1705); + int16x4_t v1966 = vld1_s16((const int16_t *)v1714); + int16x4_t v1968 = vld1_s16((const int16_t *)v1723); float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v1928), 15); float32x4_t v46 = vcvtq_n_f32_s32(vmovl_s16(v1930), 15); float32x4_t v54 = 
vcvtq_n_f32_s32(vmovl_s16(v1932), 15); @@ -16029,8 +16029,8 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu22(const armral_cmplx_int16_t *restrict x, float32x4_t v639 = vaddq_f32(v492, v624); float32x4_t v641 = vsubq_f32(v624, v620); float32x4_t v644 = vaddq_f32(v643, v621); - *(int16x4_t *)v1733 = v659; - *(int16x4_t *)v1742 = v667; + vst1_s16((int16_t *)v1733, v659); + vst1_s16((int16_t *)v1742, v667); float32x4_t v409 = vsubq_f32(v408, v398); float32x4_t v411 = vaddq_f32(v410, v399); float32x4_t v413 = vsubq_f32(v412, v399); @@ -16091,26 +16091,26 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu22(const armral_cmplx_int16_t *restrict x, int16x4_t v683 = vqmovn_s32(vcvtq_n_s32_f32(v656, 15)); int16x4_t v819 = vqmovn_s32(vcvtq_n_s32_f32(v428, 15)); int16x4_t v827 = vqmovn_s32(vcvtq_n_s32_f32(v647, 15)); - *(int16x4_t *)v1769 = v691; - *(int16x4_t *)v1778 = v699; - *(int16x4_t *)v1787 = v707; - *(int16x4_t *)v1796 = v715; - *(int16x4_t *)v1805 = v723; - *(int16x4_t *)v1814 = v731; - *(int16x4_t *)v1823 = v739; - *(int16x4_t *)v1832 = v747; - *(int16x4_t *)v1841 = v755; - *(int16x4_t *)v1850 = v763; - *(int16x4_t *)v1859 = v771; - *(int16x4_t *)v1868 = v779; - *(int16x4_t *)v1877 = v787; - *(int16x4_t *)v1886 = v795; - *(int16x4_t *)v1895 = v803; - *(int16x4_t *)v1904 = v811; - *(int16x4_t *)v1751 = v675; - *(int16x4_t *)v1760 = v683; - *(int16x4_t *)v1913 = v819; - *(int16x4_t *)v1922 = v827; + vst1_s16((int16_t *)v1769, v691); + vst1_s16((int16_t *)v1778, v699); + vst1_s16((int16_t *)v1787, v707); + vst1_s16((int16_t *)v1796, v715); + vst1_s16((int16_t *)v1805, v723); + vst1_s16((int16_t *)v1814, v731); + vst1_s16((int16_t *)v1823, v739); + vst1_s16((int16_t *)v1832, v747); + vst1_s16((int16_t *)v1841, v755); + vst1_s16((int16_t *)v1850, v763); + vst1_s16((int16_t *)v1859, v771); + vst1_s16((int16_t *)v1868, v779); + vst1_s16((int16_t *)v1877, v787); + vst1_s16((int16_t *)v1886, v795); + vst1_s16((int16_t *)v1895, v803); + vst1_s16((int16_t *)v1904, v811); + vst1_s16((int16_t 
*)v1751, v675); + vst1_s16((int16_t *)v1760, v683); + vst1_s16((int16_t *)v1913, v819); + vst1_s16((int16_t *)v1922, v827); v5 += 2 * 1; v6 += 2 * 1; } @@ -17397,7 +17397,7 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu24(const armral_cmplx_int16_t *restrict x, float32x2_t v452 = (float32x2_t){v450, v451}; const int32_t *v1264 = &v5[0]; int32_t *v1463 = &v6[0]; - int16x4_t v1694 = *(const int16x4_t *)v1336; + int16x4_t v1694 = vld1_s16((const int16_t *)v1336); float32x4_t v117 = vcvtq_n_f32_s32(vmovl_s16(v1694), 15); float32x2_t v285 = vmul_f32(v453, v283); float32x2_t v293 = vmul_f32(v453, v291); @@ -17454,7 +17454,7 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu24(const armral_cmplx_int16_t *restrict x, int32_t *v1652 = &v6[ostride * 15]; int32_t *v1661 = &v6[ostride * 7]; int32_t *v1670 = &v6[ostride * 23]; - int16x4_t v1678 = *(const int16x4_t *)v1264; + int16x4_t v1678 = vld1_s16((const int16_t *)v1264); float32x4_t v46 = vcvtq_n_f32_s32(vmovl_s16(v1678), 15); float32x4_t v287 = vcombine_f32(v285, v285); float32x4_t v295 = vcombine_f32(v293, v293); @@ -17462,28 +17462,28 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu24(const armral_cmplx_int16_t *restrict x, float32x4_t v370 = vcombine_f32(v368, v368); float32x4_t v438 = vcombine_f32(v436, v436); float32x4_t v456 = vcombine_f32(v454, v454); - int16x4_t v1674 = *(const int16x4_t *)v1245; - int16x4_t v1676 = *(const int16x4_t *)v1254; - int16x4_t v1680 = *(const int16x4_t *)v1273; - int16x4_t v1682 = *(const int16x4_t *)v1282; - int16x4_t v1684 = *(const int16x4_t *)v1291; - int16x4_t v1686 = *(const int16x4_t *)v1300; - int16x4_t v1688 = *(const int16x4_t *)v1309; - int16x4_t v1690 = *(const int16x4_t *)v1318; - int16x4_t v1692 = *(const int16x4_t *)v1327; - int16x4_t v1696 = *(const int16x4_t *)v1345; - int16x4_t v1698 = *(const int16x4_t *)v1354; - int16x4_t v1700 = *(const int16x4_t *)v1363; - int16x4_t v1702 = *(const int16x4_t *)v1372; - int16x4_t v1704 = *(const int16x4_t *)v1381; - int16x4_t v1706 = *(const int16x4_t 
*)v1390; - int16x4_t v1708 = *(const int16x4_t *)v1399; - int16x4_t v1710 = *(const int16x4_t *)v1408; - int16x4_t v1712 = *(const int16x4_t *)v1417; - int16x4_t v1714 = *(const int16x4_t *)v1426; - int16x4_t v1716 = *(const int16x4_t *)v1435; - int16x4_t v1718 = *(const int16x4_t *)v1444; - int16x4_t v1720 = *(const int16x4_t *)v1453; + int16x4_t v1674 = vld1_s16((const int16_t *)v1245); + int16x4_t v1676 = vld1_s16((const int16_t *)v1254); + int16x4_t v1680 = vld1_s16((const int16_t *)v1273); + int16x4_t v1682 = vld1_s16((const int16_t *)v1282); + int16x4_t v1684 = vld1_s16((const int16_t *)v1291); + int16x4_t v1686 = vld1_s16((const int16_t *)v1300); + int16x4_t v1688 = vld1_s16((const int16_t *)v1309); + int16x4_t v1690 = vld1_s16((const int16_t *)v1318); + int16x4_t v1692 = vld1_s16((const int16_t *)v1327); + int16x4_t v1696 = vld1_s16((const int16_t *)v1345); + int16x4_t v1698 = vld1_s16((const int16_t *)v1354); + int16x4_t v1700 = vld1_s16((const int16_t *)v1363); + int16x4_t v1702 = vld1_s16((const int16_t *)v1372); + int16x4_t v1704 = vld1_s16((const int16_t *)v1381); + int16x4_t v1706 = vld1_s16((const int16_t *)v1390); + int16x4_t v1708 = vld1_s16((const int16_t *)v1399); + int16x4_t v1710 = vld1_s16((const int16_t *)v1408); + int16x4_t v1712 = vld1_s16((const int16_t *)v1417); + int16x4_t v1714 = vld1_s16((const int16_t *)v1426); + int16x4_t v1716 = vld1_s16((const int16_t *)v1435); + int16x4_t v1718 = vld1_s16((const int16_t *)v1444); + int16x4_t v1720 = vld1_s16((const int16_t *)v1453); float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v1674), 15); float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v1676), 15); float32x4_t v55 = vcvtq_n_f32_s32(vmovl_s16(v1680), 15); @@ -17643,8 +17643,8 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu24(const armral_cmplx_int16_t *restrict x, float32x4_t v470 = vsubq_f32(v468, v410); float32x4_t v577 = vaddq_f32(v576, v418); float32x4_t v578 = vsubq_f32(v576, v418); - *(int16x4_t *)v1463 = v473; - *(int16x4_t *)v1571 = v581; + 
vst1_s16((int16_t *)v1463, v473); + vst1_s16((int16_t *)v1571, v581); float32x4_t v308 = vaddq_f32(v304, v306); float32x4_t v309 = vsubq_f32(v304, v306); float32x4_t v310 = vaddq_f32(v305, v307); @@ -17669,12 +17669,12 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu24(const armral_cmplx_int16_t *restrict x, float32x4_t v632 = vsubq_f32(v630, v458); float32x4_t v657 = vaddq_f32(v308, v383); int16x4_t v662 = vqmovn_s32(vcvtq_n_s32_f32(v308, 15)); - *(int16x4_t *)v1472 = v481; - *(int16x4_t *)v1481 = v489; - *(int16x4_t *)v1517 = v527; - *(int16x4_t *)v1580 = v589; - *(int16x4_t *)v1589 = v597; - *(int16x4_t *)v1625 = v635; + vst1_s16((int16_t *)v1472, v481); + vst1_s16((int16_t *)v1481, v489); + vst1_s16((int16_t *)v1517, v527); + vst1_s16((int16_t *)v1580, v589); + vst1_s16((int16_t *)v1589, v597); + vst1_s16((int16_t *)v1625, v635); float32x4_t v496 = vaddq_f32(v495, v465); float32x4_t v497 = vsubq_f32(v495, v465); int16x4_t v535 = vqmovn_s32(vcvtq_n_s32_f32(v524, 15)); @@ -17687,10 +17687,10 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu24(const armral_cmplx_int16_t *restrict x, int16x4_t v651 = vqmovn_s32(vcvtq_n_s32_f32(v631, 15)); float32x4_t v658 = vaddq_f32(v657, v464); float32x4_t v659 = vsubq_f32(v657, v464); - *(int16x4_t *)v1490 = v500; - *(int16x4_t *)v1544 = v554; - *(int16x4_t *)v1598 = v608; - *(int16x4_t *)v1652 = v662; + vst1_s16((int16_t *)v1490, v500); + vst1_s16((int16_t *)v1544, v554); + vst1_s16((int16_t *)v1598, v608); + vst1_s16((int16_t *)v1652, v662); int16x4_t v508 = vqmovn_s32(vcvtq_n_s32_f32(v497, 15)); int16x4_t v516 = vqmovn_s32(vcvtq_n_s32_f32(v496, 15)); int16x4_t v562 = vqmovn_s32(vcvtq_n_s32_f32(v551, 15)); @@ -17699,18 +17699,18 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu24(const armral_cmplx_int16_t *restrict x, int16x4_t v624 = vqmovn_s32(vcvtq_n_s32_f32(v604, 15)); int16x4_t v670 = vqmovn_s32(vcvtq_n_s32_f32(v659, 15)); int16x4_t v678 = vqmovn_s32(vcvtq_n_s32_f32(v658, 15)); - *(int16x4_t *)v1526 = v535; - *(int16x4_t *)v1535 = v543; - 
*(int16x4_t *)v1634 = v643; - *(int16x4_t *)v1643 = v651; - *(int16x4_t *)v1499 = v508; - *(int16x4_t *)v1508 = v516; - *(int16x4_t *)v1553 = v562; - *(int16x4_t *)v1562 = v570; - *(int16x4_t *)v1607 = v616; - *(int16x4_t *)v1616 = v624; - *(int16x4_t *)v1661 = v670; - *(int16x4_t *)v1670 = v678; + vst1_s16((int16_t *)v1526, v535); + vst1_s16((int16_t *)v1535, v543); + vst1_s16((int16_t *)v1634, v643); + vst1_s16((int16_t *)v1643, v651); + vst1_s16((int16_t *)v1499, v508); + vst1_s16((int16_t *)v1508, v516); + vst1_s16((int16_t *)v1553, v562); + vst1_s16((int16_t *)v1562, v570); + vst1_s16((int16_t *)v1607, v616); + vst1_s16((int16_t *)v1616, v624); + vst1_s16((int16_t *)v1661, v670); + vst1_s16((int16_t *)v1670, v678); v5 += 2 * 1; v6 += 2 * 1; } @@ -18779,7 +18779,7 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu25(const armral_cmplx_int16_t *restrict x, float32x2_t v1739 = (float32x2_t){v1738, v1738}; const int32_t *v3210 = &v5[0]; int32_t *v3436 = &v6[0]; - int16x4_t v3666 = *(const int16x4_t *)v3255; + int16x4_t v3666 = vld1_s16((const int16_t *)v3255); float32x4_t v201 = vcvtq_n_f32_s32(vmovl_s16(v3666), 15); float32x2_t v942 = (float32x2_t){v941, v944}; float32x4_t v1062 = vcombine_f32(v1061, v1061); @@ -18850,7 +18850,7 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu25(const armral_cmplx_int16_t *restrict x, int32_t *v3634 = &v6[ostride * 14]; int32_t *v3643 = &v6[ostride * 19]; int32_t *v3652 = &v6[ostride * 24]; - int16x4_t v3656 = *(const int16x4_t *)v3210; + int16x4_t v3656 = vld1_s16((const int16_t *)v3210); float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v3656), 15); float32x4_t v946 = vcombine_f32(v942, v942); float32x4_t v1070 = vcombine_f32(v1068, v1068); @@ -18863,29 +18863,29 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu25(const armral_cmplx_int16_t *restrict x, float32x4_t v1617 = vcombine_f32(v1615, v1615); float32x4_t v1638 = vcombine_f32(v1636, v1636); float32x4_t v1712 = vcombine_f32(v1710, v1710); - int16x4_t v3658 = *(const int16x4_t *)v3219; - int16x4_t 
v3660 = *(const int16x4_t *)v3228; - int16x4_t v3662 = *(const int16x4_t *)v3237; - int16x4_t v3664 = *(const int16x4_t *)v3246; - int16x4_t v3668 = *(const int16x4_t *)v3264; - int16x4_t v3670 = *(const int16x4_t *)v3273; - int16x4_t v3672 = *(const int16x4_t *)v3282; - int16x4_t v3674 = *(const int16x4_t *)v3291; - int16x4_t v3676 = *(const int16x4_t *)v3300; - int16x4_t v3678 = *(const int16x4_t *)v3309; - int16x4_t v3680 = *(const int16x4_t *)v3318; - int16x4_t v3682 = *(const int16x4_t *)v3327; - int16x4_t v3684 = *(const int16x4_t *)v3336; - int16x4_t v3686 = *(const int16x4_t *)v3345; - int16x4_t v3688 = *(const int16x4_t *)v3354; - int16x4_t v3690 = *(const int16x4_t *)v3363; - int16x4_t v3692 = *(const int16x4_t *)v3372; - int16x4_t v3694 = *(const int16x4_t *)v3381; - int16x4_t v3696 = *(const int16x4_t *)v3390; - int16x4_t v3698 = *(const int16x4_t *)v3399; - int16x4_t v3700 = *(const int16x4_t *)v3408; - int16x4_t v3702 = *(const int16x4_t *)v3417; - int16x4_t v3704 = *(const int16x4_t *)v3426; + int16x4_t v3658 = vld1_s16((const int16_t *)v3219); + int16x4_t v3660 = vld1_s16((const int16_t *)v3228); + int16x4_t v3662 = vld1_s16((const int16_t *)v3237); + int16x4_t v3664 = vld1_s16((const int16_t *)v3246); + int16x4_t v3668 = vld1_s16((const int16_t *)v3264); + int16x4_t v3670 = vld1_s16((const int16_t *)v3273); + int16x4_t v3672 = vld1_s16((const int16_t *)v3282); + int16x4_t v3674 = vld1_s16((const int16_t *)v3291); + int16x4_t v3676 = vld1_s16((const int16_t *)v3300); + int16x4_t v3678 = vld1_s16((const int16_t *)v3309); + int16x4_t v3680 = vld1_s16((const int16_t *)v3318); + int16x4_t v3682 = vld1_s16((const int16_t *)v3327); + int16x4_t v3684 = vld1_s16((const int16_t *)v3336); + int16x4_t v3686 = vld1_s16((const int16_t *)v3345); + int16x4_t v3688 = vld1_s16((const int16_t *)v3354); + int16x4_t v3690 = vld1_s16((const int16_t *)v3363); + int16x4_t v3692 = vld1_s16((const int16_t *)v3372); + int16x4_t v3694 = vld1_s16((const int16_t *)v3381); + 
int16x4_t v3696 = vld1_s16((const int16_t *)v3390); + int16x4_t v3698 = vld1_s16((const int16_t *)v3399); + int16x4_t v3700 = vld1_s16((const int16_t *)v3408); + int16x4_t v3702 = vld1_s16((const int16_t *)v3417); + int16x4_t v3704 = vld1_s16((const int16_t *)v3426); float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v3658), 15); float32x4_t v44 = vcvtq_n_f32_s32(vmovl_s16(v3660), 15); float32x4_t v52 = vcvtq_n_f32_s32(vmovl_s16(v3662), 15); @@ -19229,7 +19229,7 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu25(const armral_cmplx_int16_t *restrict x, float32x4_t v1625 = vmulq_f32(v1591, v1740); float32x4_t v1641 = vsubq_f32(v1605, v1640); float32x4_t v1646 = vmulq_f32(v1605, v1740); - *(int16x4_t *)v3436 = v991; + vst1_s16((int16_t *)v3436, v991); float32x4_t v981 = vsubq_f32(v980, v975); float32x4_t v1022 = vsubq_f32(v975, v1021); float32x4_t v1035 = vmulq_f32(v975, v1740); @@ -19290,8 +19290,8 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu25(const armral_cmplx_int16_t *restrict x, float32x4_t v1680 = vaddq_f32(v193, v1648); float32x4_t v1694 = vrev64q_f32(v1661); float32x4_t v1711 = vrev64q_f32(v1679); - *(int16x4_t *)v3454 = v1025; - *(int16x4_t *)v3526 = v1337; + vst1_s16((int16_t *)v3454, v1025); + vst1_s16((int16_t *)v3526, v1337); int16x4_t v1053 = vqmovn_s32(vcvtq_n_s32_f32(v1050, 15)); float32x4_t v1148 = vsubq_f32(v1136, v1147); float32x4_t v1153 = vmulq_f32(v1136, v1740); @@ -19304,10 +19304,10 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu25(const armral_cmplx_int16_t *restrict x, int16x4_t v1683 = vqmovn_s32(vcvtq_n_s32_f32(v1680, 15)); float32x4_t v1696 = vmulq_f32(v1694, v1712); float32x4_t v1713 = vmulq_f32(v1711, v1712); - *(int16x4_t *)v3445 = v1008; - *(int16x4_t *)v3463 = v1039; - *(int16x4_t *)v3481 = v1164; - *(int16x4_t *)v3571 = v1510; + vst1_s16((int16_t *)v3445, v1008); + vst1_s16((int16_t *)v3463, v1039); + vst1_s16((int16_t *)v3481, v1164); + vst1_s16((int16_t *)v3571, v1510); float32x4_t v1154 = vsubq_f32(v1153, v1148); float32x4_t v1195 = vsubq_f32(v1148, 
v1194); float32x4_t v1208 = vmulq_f32(v1148, v1740); @@ -19320,8 +19320,8 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu25(const armral_cmplx_int16_t *restrict x, float32x4_t v1554 = vmulq_f32(v1494, v1740); float32x4_t v1667 = vsubq_f32(v1655, v1666); float32x4_t v1672 = vmulq_f32(v1655, v1740); - *(int16x4_t *)v3472 = v1053; - *(int16x4_t *)v3616 = v1683; + vst1_s16((int16_t *)v3472, v1053); + vst1_s16((int16_t *)v3616, v1683); float32x4_t v1178 = vsubq_f32(v1154, v1177); int16x4_t v1198 = vqmovn_s32(vcvtq_n_s32_f32(v1195, 15)); float32x4_t v1209 = vsubq_f32(v1208, v1195); @@ -19336,7 +19336,7 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu25(const armral_cmplx_int16_t *restrict x, float32x4_t v1673 = vsubq_f32(v1672, v1667); float32x4_t v1714 = vsubq_f32(v1667, v1713); float32x4_t v1727 = vmulq_f32(v1667, v1740); - *(int16x4_t *)v3544 = v1371; + vst1_s16((int16_t *)v3544, v1371); int16x4_t v1181 = vqmovn_s32(vcvtq_n_s32_f32(v1178, 15)); int16x4_t v1212 = vqmovn_s32(vcvtq_n_s32_f32(v1209, 15)); float32x4_t v1223 = vsubq_f32(v1222, v1178); @@ -19348,27 +19348,27 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu25(const armral_cmplx_int16_t *restrict x, int16x4_t v1717 = vqmovn_s32(vcvtq_n_s32_f32(v1714, 15)); float32x4_t v1728 = vsubq_f32(v1727, v1714); float32x4_t v1741 = vmulq_f32(v1673, v1740); - *(int16x4_t *)v3499 = v1198; - *(int16x4_t *)v3535 = v1354; - *(int16x4_t *)v3553 = v1385; - *(int16x4_t *)v3589 = v1544; + vst1_s16((int16_t *)v3499, v1198); + vst1_s16((int16_t *)v3535, v1354); + vst1_s16((int16_t *)v3553, v1385); + vst1_s16((int16_t *)v3589, v1544); int16x4_t v1226 = vqmovn_s32(vcvtq_n_s32_f32(v1223, 15)); int16x4_t v1572 = vqmovn_s32(vcvtq_n_s32_f32(v1569, 15)); int16x4_t v1700 = vqmovn_s32(vcvtq_n_s32_f32(v1697, 15)); int16x4_t v1731 = vqmovn_s32(vcvtq_n_s32_f32(v1728, 15)); float32x4_t v1742 = vsubq_f32(v1741, v1697); - *(int16x4_t *)v3490 = v1181; - *(int16x4_t *)v3508 = v1212; - *(int16x4_t *)v3562 = v1399; - *(int16x4_t *)v3580 = v1527; - *(int16x4_t *)v3598 = 
v1558; - *(int16x4_t *)v3634 = v1717; + vst1_s16((int16_t *)v3490, v1181); + vst1_s16((int16_t *)v3508, v1212); + vst1_s16((int16_t *)v3562, v1399); + vst1_s16((int16_t *)v3580, v1527); + vst1_s16((int16_t *)v3598, v1558); + vst1_s16((int16_t *)v3634, v1717); int16x4_t v1745 = vqmovn_s32(vcvtq_n_s32_f32(v1742, 15)); - *(int16x4_t *)v3517 = v1226; - *(int16x4_t *)v3607 = v1572; - *(int16x4_t *)v3625 = v1700; - *(int16x4_t *)v3643 = v1731; - *(int16x4_t *)v3652 = v1745; + vst1_s16((int16_t *)v3517, v1226); + vst1_s16((int16_t *)v3607, v1572); + vst1_s16((int16_t *)v3625, v1700); + vst1_s16((int16_t *)v3643, v1731); + vst1_s16((int16_t *)v3652, v1745); v5 += 2 * 1; v6 += 2 * 1; } @@ -20867,7 +20867,7 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu32(const armral_cmplx_int16_t *restrict x, float32x2_t v1199 = (float32x2_t){v1197, v1198}; const int32_t *v2277 = &v5[0]; int32_t *v2566 = &v6[0]; - int16x4_t v2881 = *(const int16x4_t *)v2421; + int16x4_t v2881 = vld1_s16((const int16_t *)v2421); float32x4_t v404 = vcvtq_n_f32_s32(vmovl_s16(v2881), 15); float32x4_t v726 = vcombine_f32(v725, v725); float32x4_t v800 = vcombine_f32(v799, v799); @@ -20951,7 +20951,7 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu32(const armral_cmplx_int16_t *restrict x, int32_t *v2827 = &v6[ostride * 15]; int32_t *v2836 = &v6[ostride * 23]; int32_t *v2845 = &v6[ostride * 31]; - int16x4_t v2849 = *(const int16x4_t *)v2277; + int16x4_t v2849 = vld1_s16((const int16_t *)v2277); float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v2849), 15); float32x4_t v808 = vcombine_f32(v806, v806); float32x4_t v882 = vcombine_f32(v880, v880); @@ -20963,36 +20963,36 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu32(const armral_cmplx_int16_t *restrict x, float32x4_t v1178 = vcombine_f32(v1176, v1176); float32x4_t v1191 = vcombine_f32(v1189, v1189); float32x4_t v1203 = vcombine_f32(v1201, v1201); - int16x4_t v2851 = *(const int16x4_t *)v2286; - int16x4_t v2853 = *(const int16x4_t *)v2295; - int16x4_t v2855 = *(const int16x4_t *)v2304; - 
int16x4_t v2857 = *(const int16x4_t *)v2313; - int16x4_t v2859 = *(const int16x4_t *)v2322; - int16x4_t v2861 = *(const int16x4_t *)v2331; - int16x4_t v2863 = *(const int16x4_t *)v2340; - int16x4_t v2865 = *(const int16x4_t *)v2349; - int16x4_t v2867 = *(const int16x4_t *)v2358; - int16x4_t v2869 = *(const int16x4_t *)v2367; - int16x4_t v2871 = *(const int16x4_t *)v2376; - int16x4_t v2873 = *(const int16x4_t *)v2385; - int16x4_t v2875 = *(const int16x4_t *)v2394; - int16x4_t v2877 = *(const int16x4_t *)v2403; - int16x4_t v2879 = *(const int16x4_t *)v2412; - int16x4_t v2883 = *(const int16x4_t *)v2430; - int16x4_t v2885 = *(const int16x4_t *)v2439; - int16x4_t v2887 = *(const int16x4_t *)v2448; - int16x4_t v2889 = *(const int16x4_t *)v2457; - int16x4_t v2891 = *(const int16x4_t *)v2466; - int16x4_t v2893 = *(const int16x4_t *)v2475; - int16x4_t v2895 = *(const int16x4_t *)v2484; - int16x4_t v2897 = *(const int16x4_t *)v2493; - int16x4_t v2899 = *(const int16x4_t *)v2502; - int16x4_t v2901 = *(const int16x4_t *)v2511; - int16x4_t v2903 = *(const int16x4_t *)v2520; - int16x4_t v2905 = *(const int16x4_t *)v2529; - int16x4_t v2907 = *(const int16x4_t *)v2538; - int16x4_t v2909 = *(const int16x4_t *)v2547; - int16x4_t v2911 = *(const int16x4_t *)v2556; + int16x4_t v2851 = vld1_s16((const int16_t *)v2286); + int16x4_t v2853 = vld1_s16((const int16_t *)v2295); + int16x4_t v2855 = vld1_s16((const int16_t *)v2304); + int16x4_t v2857 = vld1_s16((const int16_t *)v2313); + int16x4_t v2859 = vld1_s16((const int16_t *)v2322); + int16x4_t v2861 = vld1_s16((const int16_t *)v2331); + int16x4_t v2863 = vld1_s16((const int16_t *)v2340); + int16x4_t v2865 = vld1_s16((const int16_t *)v2349); + int16x4_t v2867 = vld1_s16((const int16_t *)v2358); + int16x4_t v2869 = vld1_s16((const int16_t *)v2367); + int16x4_t v2871 = vld1_s16((const int16_t *)v2376); + int16x4_t v2873 = vld1_s16((const int16_t *)v2385); + int16x4_t v2875 = vld1_s16((const int16_t *)v2394); + int16x4_t v2877 = 
vld1_s16((const int16_t *)v2403); + int16x4_t v2879 = vld1_s16((const int16_t *)v2412); + int16x4_t v2883 = vld1_s16((const int16_t *)v2430); + int16x4_t v2885 = vld1_s16((const int16_t *)v2439); + int16x4_t v2887 = vld1_s16((const int16_t *)v2448); + int16x4_t v2889 = vld1_s16((const int16_t *)v2457); + int16x4_t v2891 = vld1_s16((const int16_t *)v2466); + int16x4_t v2893 = vld1_s16((const int16_t *)v2475); + int16x4_t v2895 = vld1_s16((const int16_t *)v2484); + int16x4_t v2897 = vld1_s16((const int16_t *)v2493); + int16x4_t v2899 = vld1_s16((const int16_t *)v2502); + int16x4_t v2901 = vld1_s16((const int16_t *)v2511); + int16x4_t v2903 = vld1_s16((const int16_t *)v2520); + int16x4_t v2905 = vld1_s16((const int16_t *)v2529); + int16x4_t v2907 = vld1_s16((const int16_t *)v2538); + int16x4_t v2909 = vld1_s16((const int16_t *)v2547); + int16x4_t v2911 = vld1_s16((const int16_t *)v2556); float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v2851), 15); float32x4_t v46 = vcvtq_n_f32_s32(vmovl_s16(v2853), 15); float32x4_t v54 = vcvtq_n_f32_s32(vmovl_s16(v2855), 15); @@ -21204,8 +21204,8 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu32(const armral_cmplx_int16_t *restrict x, float32x4_t v690 = vaddq_f32(v268, v686); float32x4_t v971 = vaddq_f32(v949, v957); float32x4_t v972 = vaddq_f32(v962, v970); - *(int16x4_t *)v2566 = v693; - *(int16x4_t *)v2584 = v709; + vst1_s16((int16_t *)v2566, v693); + vst1_s16((int16_t *)v2584, v709); float32x4_t v156 = vmulq_f32(v154, v1203); float32x4_t v299 = vaddq_f32(v297, v298); float32x4_t v300 = vsubq_f32(v298, v297); @@ -21250,8 +21250,8 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu32(const armral_cmplx_int16_t *restrict x, float32x4_t v984 = vsubq_f32(v269, v973); float32x4_t v1121 = vaddq_f32(v1119, v1120); float32x4_t v1122 = vsubq_f32(v1120, v1119); - *(int16x4_t *)v2575 = v701; - *(int16x4_t *)v2593 = v717; + vst1_s16((int16_t *)v2575, v701); + vst1_s16((int16_t *)v2593, v717); float32x4_t v308 = vmulq_f32(v306, v1203); float32x4_t v353 = 
vsubq_f32(v118, v350); float32x4_t v354 = vaddq_f32(v118, v350); @@ -21293,8 +21293,8 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu32(const armral_cmplx_int16_t *restrict x, float32x4_t v1130 = vmulq_f32(v1128, v1203); float32x4_t v1131 = vaddq_f32(v353, v1121); float32x4_t v1132 = vsubq_f32(v353, v1121); - *(int16x4_t *)v2710 = v989; - *(int16x4_t *)v2728 = v1005; + vst1_s16((int16_t *)v2710, v989); + vst1_s16((int16_t *)v2728, v1005); float32x4_t v758 = vrev64q_f32(v752); float32x4_t v761 = vaddq_f32(v309, v751); float32x4_t v762 = vsubq_f32(v309, v751); @@ -21313,8 +21313,8 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu32(const armral_cmplx_int16_t *restrict x, int16x4_t v1153 = vqmovn_s32(vcvtq_n_s32_f32(v1132, 15)); float32x4_t v1193 = vfmaq_f32(v1171, v1177, v1178); float32x4_t v1194 = vfmaq_f32(v1184, v1190, v1191); - *(int16x4_t *)v2638 = v841; - *(int16x4_t *)v2656 = v857; + vst1_s16((int16_t *)v2638, v841); + vst1_s16((int16_t *)v2656, v857); float32x4_t v760 = vmulq_f32(v758, v1203); int16x4_t v767 = vqmovn_s32(vcvtq_n_s32_f32(v761, 15)); int16x4_t v783 = vqmovn_s32(vcvtq_n_s32_f32(v762, 15)); @@ -21329,10 +21329,10 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu32(const armral_cmplx_int16_t *restrict x, int16x4_t v1161 = vqmovn_s32(vcvtq_n_s32_f32(v1134, 15)); float32x4_t v1195 = vaddq_f32(v1193, v1194); float32x4_t v1196 = vsubq_f32(v1194, v1193); - *(int16x4_t *)v2719 = v997; - *(int16x4_t *)v2737 = v1013; - *(int16x4_t *)v2782 = v1137; - *(int16x4_t *)v2800 = v1153; + vst1_s16((int16_t *)v2719, v997); + vst1_s16((int16_t *)v2737, v1013); + vst1_s16((int16_t *)v2782, v1137); + vst1_s16((int16_t *)v2800, v1153); float32x4_t v763 = vsubq_f32(v310, v760); float32x4_t v764 = vaddq_f32(v310, v760); float32x4_t v906 = vrev64q_f32(v900); @@ -21343,14 +21343,14 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu32(const armral_cmplx_int16_t *restrict x, float32x4_t v1202 = vrev64q_f32(v1196); float32x4_t v1205 = vaddq_f32(v395, v1195); float32x4_t v1206 = vsubq_f32(v395, v1195); - 
*(int16x4_t *)v2602 = v767; - *(int16x4_t *)v2620 = v783; - *(int16x4_t *)v2647 = v849; - *(int16x4_t *)v2665 = v865; - *(int16x4_t *)v2746 = v1063; - *(int16x4_t *)v2764 = v1079; - *(int16x4_t *)v2791 = v1145; - *(int16x4_t *)v2809 = v1161; + vst1_s16((int16_t *)v2602, v767); + vst1_s16((int16_t *)v2620, v783); + vst1_s16((int16_t *)v2647, v849); + vst1_s16((int16_t *)v2665, v865); + vst1_s16((int16_t *)v2746, v1063); + vst1_s16((int16_t *)v2764, v1079); + vst1_s16((int16_t *)v2791, v1145); + vst1_s16((int16_t *)v2809, v1161); int16x4_t v775 = vqmovn_s32(vcvtq_n_s32_f32(v763, 15)); int16x4_t v791 = vqmovn_s32(vcvtq_n_s32_f32(v764, 15)); float32x4_t v908 = vmulq_f32(v906, v1203); @@ -21365,22 +21365,22 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu32(const armral_cmplx_int16_t *restrict x, float32x4_t v912 = vaddq_f32(v394, v908); float32x4_t v1207 = vsubq_f32(v396, v1204); float32x4_t v1208 = vaddq_f32(v396, v1204); - *(int16x4_t *)v2611 = v775; - *(int16x4_t *)v2629 = v791; - *(int16x4_t *)v2674 = v915; - *(int16x4_t *)v2692 = v931; - *(int16x4_t *)v2755 = v1071; - *(int16x4_t *)v2773 = v1087; - *(int16x4_t *)v2818 = v1211; - *(int16x4_t *)v2836 = v1227; + vst1_s16((int16_t *)v2611, v775); + vst1_s16((int16_t *)v2629, v791); + vst1_s16((int16_t *)v2674, v915); + vst1_s16((int16_t *)v2692, v931); + vst1_s16((int16_t *)v2755, v1071); + vst1_s16((int16_t *)v2773, v1087); + vst1_s16((int16_t *)v2818, v1211); + vst1_s16((int16_t *)v2836, v1227); int16x4_t v923 = vqmovn_s32(vcvtq_n_s32_f32(v911, 15)); int16x4_t v939 = vqmovn_s32(vcvtq_n_s32_f32(v912, 15)); int16x4_t v1219 = vqmovn_s32(vcvtq_n_s32_f32(v1207, 15)); int16x4_t v1235 = vqmovn_s32(vcvtq_n_s32_f32(v1208, 15)); - *(int16x4_t *)v2683 = v923; - *(int16x4_t *)v2701 = v939; - *(int16x4_t *)v2827 = v1219; - *(int16x4_t *)v2845 = v1235; + vst1_s16((int16_t *)v2683, v923); + vst1_s16((int16_t *)v2701, v939); + vst1_s16((int16_t *)v2827, v1219); + vst1_s16((int16_t *)v2845, v1235); v5 += 2 * 1; v6 += 2 * 1; } diff --git 
a/src/LowerPHY/FFT/fft_execute.cpp b/src/LowerPHY/FFT/fft_execute.cpp index f45d07fef36e86cbaf4237d40fa63cd8e0a43b14..fc23ddd71f22b80eddd294e285ebd332898a5915 100644 --- a/src/LowerPHY/FFT/fft_execute.cpp +++ b/src/LowerPHY/FFT/fft_execute.cpp @@ -4,11 +4,8 @@ */ #include "fft_execute.hpp" -#include "fft_level.hpp" #include "fft_plan.hpp" -#include - namespace { template @@ -21,6 +18,9 @@ inline void execute_single_level(const armral_fft_plan_t *p, const Tx *x, Ty *y, if (lev->r) { armral::fft::execute_rader(lev->r, x, y, istride, ostride, nullptr, howmany, idist, odist); + } else if (lev->bs) { + armral::fft::execute_bluestein(lev->bs, x, y, istride, ostride, + nullptr, howmany, idist, odist); } else { assert(lev->kernel); lev->kernel(x, y, istride, ostride, howmany, p->dir); @@ -37,7 +37,7 @@ inline void execute_dit(const armral::fft::lev_base_t *lev, const Tx *x, Ty *y, int n2_istride = level->how_many * istride; int n1_ostride = level->how_many * level->n2 * ostride; int n1_istride = level->how_many * level->n2 * istride; - assert(level->kernel || level->r); + assert(level->kernel || level->r || level->bs); if (level->kernel) { assert(n2_ostride == level->how_many); if (n2_istride == level->how_many) { @@ -57,13 +57,19 @@ inline void execute_dit(const armral::fft::lev_base_t *lev, const Tx *x, Ty *y, } } } else { - // Rader's + // Rader's or Bluestein's for (int hm = 0; hm != level->how_many; ++hm) { const Tx *x_ptr = &x[hm]; Ty *y_ptr = &y[hm]; - armral::fft::execute_rader(level->r, x_ptr, y_ptr, n1_istride, - n1_ostride, nullptr, level->n2, - n2_istride, n2_ostride); + if (level->r) { + armral::fft::execute_rader( + level->r, x_ptr, y_ptr, n1_istride, n1_ostride, nullptr, level->n2, + n2_istride, n2_ostride); + } else { + armral::fft::execute_bluestein( + level->bs, x_ptr, y_ptr, n1_istride, n1_ostride, nullptr, level->n2, + n2_istride, n2_ostride); + } } } } @@ -78,7 +84,7 @@ inline void execute_dit_ab_twid(const armral::fft::lev_base_t *lev, const Tx *x, 
int n2_istride = level->how_many * level->n1 * istride; int n1_ostride = level->how_many * level->n2 * ostride; int n1_istride = level->how_many * istride; - assert(level->kernel || level->r); + assert(level->kernel || level->r || level->bs); if (level->kernel) { assert(level->ab_twid_gu_kernel); for (int hm = 0; hm != level->how_many; ++hm) { @@ -99,9 +105,15 @@ inline void execute_dit_ab_twid(const armral::fft::lev_base_t *lev, const Tx *x, for (int hm = 0; hm < level->how_many; ++hm) { const Tx *x_ptr = &x[hm]; Ty *y_ptr = &y[hm]; - armral::fft::execute_rader(level->r, x_ptr, y_ptr, n1_istride, - n1_ostride, level->twids, - level->n2, n2_istride, n2_ostride); + if (level->r) { + armral::fft::execute_rader( + level->r, x_ptr, y_ptr, n1_istride, n1_ostride, level->twids, + level->n2, n2_istride, n2_ostride); + } else { + armral::fft::execute_bluestein( + level->bs, x_ptr, y_ptr, n1_istride, n1_ostride, level->twids, + level->n2, n2_istride, n2_ostride); + } } } } @@ -116,7 +128,7 @@ inline void execute_dit_ac_twid(const armral::fft::lev_base_t *lev, const Tx *x, int n2_istride = level->how_many * level->n1 * istride; int n1_ostride = level->how_many * level->n2 * ostride; int n1_istride = level->how_many * istride; - assert(level->kernel || level->r); + assert(level->kernel || level->r || level->bs); if (level->kernel) { assert(level->ac_twid_kernel); level->kernel(x, y, n1_istride, n1_ostride, level->how_many, lev->dir); @@ -137,9 +149,15 @@ inline void execute_dit_ac_twid(const armral::fft::lev_base_t *lev, const Tx *x, for (int hm = 0; hm < level->how_many; ++hm) { const Tx *x_ptr = &x[hm]; Ty *y_ptr = &y[hm]; - armral::fft::execute_rader(level->r, x_ptr, y_ptr, n1_istride, - n1_ostride, level->twids, - level->n2, n2_istride, n2_ostride); + if (level->r) { + armral::fft::execute_rader( + level->r, x_ptr, y_ptr, n1_istride, n1_ostride, level->twids, + level->n2, n2_istride, n2_ostride); + } else { + armral::fft::execute_bluestein( + level->bs, x_ptr, y_ptr, 
n1_istride, n1_ostride, level->twids, + level->n2, n2_istride, n2_ostride); + } } } } diff --git a/src/LowerPHY/FFT/fft_level.hpp b/src/LowerPHY/FFT/fft_level.hpp index fa0e64e75b6f11093428bbee964a23b16414f2e6..f254a383fb88ce6b7bf0658dd7e69a5fe35b7f14 100644 --- a/src/LowerPHY/FFT/fft_level.hpp +++ b/src/LowerPHY/FFT/fft_level.hpp @@ -4,6 +4,7 @@ */ #pragma once +#include "bluestein.hpp" #include "fft_types.hpp" #include "rader.hpp" @@ -16,6 +17,9 @@ namespace armral::fft { template struct rader; +template +struct bluestein; + struct lev_base_t { int n; int n1; @@ -32,6 +36,7 @@ struct lev_base_t { virtual ~lev_base_t() = default; + // GCOVR_EXCL_START void operator delete(void *ptr) noexcept { // For some reason the compilers complain about symbol not found in the // case that we do a debug build. We should not end up in the operator @@ -39,6 +44,8 @@ struct lev_base_t { assert(false && "Operator delete in lev_base_t called. This should not be possible"); } + + // GCOVR_EXCL_STOP }; template @@ -54,6 +61,7 @@ struct lev_t : public lev_base_t { fft_ab_twid_gs_func_t ab_twid_gs_kernel; fft_ac_twid_func_t ac_twid_kernel; rader r; + bluestein bs; lev_t() = delete; @@ -64,12 +72,13 @@ struct lev_t : public lev_base_t { fft_ab_twid_gu_func_t ab_twid_gu_kernel_in, fft_ab_twid_gs_func_t ab_twid_gs_kernel_in, fft_ac_twid_func_t ac_twid_kernel_in, - rader r_in) + rader r_in, bluestein bs_in) : lev_base_t(n_in, n1_in, n2_in, how_many_in, dir_in), twids(twids_in), kernel(kernel_in), ac_gu_kernel(ac_gu_kernel_in), ab_twid_gu_kernel(ab_twid_gu_kernel_in), ab_twid_gs_kernel(ab_twid_gs_kernel_in), - ac_twid_kernel(ac_twid_kernel_in), r(std::move(r_in)) {} + ac_twid_kernel(ac_twid_kernel_in), r(std::move(r_in)), + bs(std::move(bs_in)) {} ~lev_t() override; diff --git a/src/LowerPHY/FFT/fft_plan.cpp b/src/LowerPHY/FFT/fft_plan.cpp index 30635e1a474affed94e49147e1db65ccce752b61..689ae5ffe778746dbf334b87e44d345067db5f37 100644 --- a/src/LowerPHY/FFT/fft_plan.cpp +++ 
b/src/LowerPHY/FFT/fft_plan.cpp @@ -3,17 +3,13 @@ SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "fft_plan.hpp" +#include "bluestein.hpp" #include "fft_cf32_kernel_lookup.h" #include "fft_cs16_kernel_lookup.h" -#include "fft_types.hpp" #include "rader.hpp" -#include -#include #include -#include #include -#include #ifdef ARMRAL_ARCH_SVE #include @@ -25,9 +21,10 @@ namespace { -constexpr int base_kernels[33] = {0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1}; +constexpr int len_base_kernels = 33; +constexpr int base_kernels[len_base_kernels] = { + 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1}; template Tw *make_twiddles(int n1, int n2, armral_fft_direction_t dir, @@ -40,7 +37,7 @@ Tw *make_twiddles(int n1, int n2, armral_fft_direction_t dir, if (want_conj_twids) { twids_len *= 2; } - Tw *twids = (Tw *)malloc(twids_len * sizeof(Tw)); + Tw *twids = static_cast(malloc(twids_len * sizeof(Tw))); int x = 0; float base_m = ((int)dir) * 2 * M_PI / (n1 * n2); for (int j = 1; j < n2; j += twid_interleave) { @@ -60,7 +57,7 @@ Tw *make_twiddles(int n1, int n2, armral_fft_direction_t dir, } inline int kernel_exists(int i) { - if (i > static_cast(sizeof(base_kernels) / sizeof(int))) { + if (i >= len_base_kernels) { return 0; } return base_kernels[i]; @@ -176,10 +173,11 @@ struct kernel_selection { armral::fft::fft_ab_twid_gs_func_t ab_twid_gs_kernel; armral::fft::fft_ac_twid_func_t ac_twid_kernel; - kernel_selection(decltype(base_kernel) base, decltype(ac_gu_kernel) ac_gu, - decltype(ab_twid_gu_kernel) ab_twid_gu, - decltype(ab_twid_gs_kernel) ab_twid_gs, - decltype(ac_twid_kernel) ac_twid) + explicit kernel_selection(decltype(base_kernel) base, + decltype(ac_gu_kernel) ac_gu, + decltype(ab_twid_gu_kernel) ab_twid_gu, + decltype(ab_twid_gs_kernel) ab_twid_gs, + decltype(ac_twid_kernel) ac_twid) : base_kernel(base), 
ac_gu_kernel(ac_gu), ab_twid_gu_kernel(ab_twid_gu), ab_twid_gs_kernel(ab_twid_gs), ac_twid_kernel(ac_twid) {} }; @@ -198,15 +196,15 @@ kernel_selection get_kernels(int n1, armral_fft_direction_t dir, auto ac_twid_kernel = want_twids ? get_ac_twiddle_base_kernel(n1, dir) : nullptr; assert(kernel); - return {kernel, ac_gu_kernel, ab_twid_gu_kernel, ab_twid_gs_kernel, - ac_twid_kernel}; + return kernel_selection{kernel, ac_gu_kernel, ab_twid_gu_kernel, + ab_twid_gs_kernel, ac_twid_kernel}; } struct factorize_result { int num_factors; int remainder; - factorize_result(int n) : num_factors(0), remainder(n) {} + explicit factorize_result(int n) : num_factors(0), remainder(n) {} factorize_result(const factorize_result &other) = delete; factorize_result(factorize_result &&other) = default; @@ -273,9 +271,9 @@ int factorize_descending(int n, armral_fft_direction_t dir, int max_nfacts, } template -armral::fft::lev_base_t *make_level_data(int n, int n1, int n2, int how_many, - armral_fft_direction_t dir, - bool want_twiddles, bool want_ac) { +armral::fft::lev_base_t * +make_level_data(int n, int n1, int n2, int how_many, armral_fft_direction_t dir, + bool want_twiddles, bool want_ac, bool allow_raders) { using level_type = armral::fft::lev_t; if (kernel_exists(n1)) { auto [kernel, ac_gu_kernel, ab_twid_gu_kernel, ab_twid_gs_kernel, @@ -290,20 +288,33 @@ armral::fft::lev_base_t *make_level_data(int n, int n1, int n2, int how_many, } return new level_type(n, n1, n2, how_many, dir, twids, kernel, ac_gu_kernel, ab_twid_gu_kernel, ab_twid_gs_kernel, ac_twid_kernel, - {}); + {}, {}); } - auto r = armral::fft::make_rader(n1, dir); - if (r.n == 0) { + if (!allow_raders) { return nullptr; } Tw *twids = want_twiddles ? 
make_twiddles(n1, n2, dir, 1, true) : nullptr; + auto maybe_r = armral::fft::make_rader(n1, dir); + if (maybe_r) { + auto r = std::move(*maybe_r); + if (r.n == 0) { + return nullptr; + } + return new level_type(n, n1, n2, how_many, dir, twids, nullptr, nullptr, + nullptr, nullptr, nullptr, std::move(r), {}); + } + auto bs = armral::fft::make_bluestein(n1, dir, base_kernels, + len_base_kernels); + if (bs.n == 0) { + return nullptr; + } return new level_type(n, n1, n2, how_many, dir, twids, nullptr, nullptr, - nullptr, nullptr, nullptr, std::move(r)); + nullptr, nullptr, nullptr, {}, std::move(bs)); } template int factorize(int n, armral_fft_direction_t dir, int max_levels, - armral::fft::lev_base_t **levels) { + armral::fft::lev_base_t **levels, bool allow_raders) { // search through the set of supported factors to find a suitable // factorization, then use that to build the level data structures. int factors[max_levels]; @@ -314,7 +325,7 @@ int factorize(int n, armral_fft_direction_t dir, int max_levels, int running_product = 1; - for (int fi = 0; fi < num_factors; ++fi) { + for (int fi = 0; fi < num_factors; fi++) { auto n1 = factors[fi]; int n2 = fi != 0 ? running_product : num_factors > 1 ? factors[1] : 1; int how_many = n / (n1 * n2); @@ -323,24 +334,24 @@ int factorize(int n, armral_fft_direction_t dir, int max_levels, if (num_factors == 1) { // Operating on a single level - input output and working types are as // specified for this function - levels[fi] = - make_level_data(n, n1, n2, how_many, dir, false, false); + levels[fi] = make_level_data(n, n1, n2, how_many, dir, + false, false, allow_raders); } else { // We have multiple levels, and are currently dealing with the first // level. 
Transform data to the working type from the input type - levels[fi] = - make_level_data(n, n1, n2, how_many, dir, false, false); + levels[fi] = make_level_data(n, n1, n2, how_many, dir, + false, false, allow_raders); } } else if (fi == num_factors - 1) { // We have multiple levels and are currently dealing with the last level. // Transform data from the working type to the output type - levels[fi] = - make_level_data(n, n1, n2, how_many, dir, true, false); + levels[fi] = make_level_data(n, n1, n2, how_many, dir, true, + false, allow_raders); } else { // We have multiple levels and are currently dealing with an intermediate // level (i.e. not first or last). All work is done in the working type - levels[fi] = - make_level_data(n, n1, n2, how_many, dir, true, true); + levels[fi] = make_level_data(n, n1, n2, how_many, dir, true, + true, allow_raders); } if (!levels[fi]) { @@ -356,7 +367,7 @@ namespace armral::fft { template armral_status create_plan(armral_fft_plan_t **p, int n, - armral_fft_direction_t dir) { + armral_fft_direction_t dir, bool allow_raders) { assert(p); // try and find a suitable decomposition, else give up. 
// we arbitrarily limit ourselves to four factors here, but there's @@ -365,7 +376,7 @@ armral_status create_plan(armral_fft_plan_t **p, int n, tmp_plan.n = n; tmp_plan.dir = dir; tmp_plan.num_levels = factorize( - n, dir, armral_fft_plan_t::max_levels, tmp_plan.levels); + n, dir, armral_fft_plan_t::max_levels, tmp_plan.levels, allow_raders); if (tmp_plan.num_levels == 0) { return ARMRAL_ARGUMENT_ERROR; } @@ -378,10 +389,12 @@ armral_status create_plan(armral_fft_plan_t **p, int n, template armral_status create_plan( - armral_fft_plan_t **p, int n, armral_fft_direction_t dir); + armral_fft_plan_t **p, int n, armral_fft_direction_t dir, + bool allow_raders); template armral_status create_plan( - armral_fft_plan_t **p, int n, armral_fft_direction_t dir); + armral_fft_plan_t **p, int n, armral_fft_direction_t dir, + bool allow_raders); armral_status destroy_plan(armral_fft_plan_t **p) { assert(p); diff --git a/src/LowerPHY/FFT/fft_plan.hpp b/src/LowerPHY/FFT/fft_plan.hpp index b793eb2a16aa8022cc71e0b5ff7b736e02021ba0..9141d436379b01ebbd09116a43681bc4aacdbccd 100644 --- a/src/LowerPHY/FFT/fft_plan.hpp +++ b/src/LowerPHY/FFT/fft_plan.hpp @@ -11,17 +11,18 @@ namespace armral::fft { /** * Creates a plan for solving FFTs. Depending on the data type, the * plan will execute different functions. - * @tparam Tx Input data type - * @tparam Ty Output data type - * @tparam Tw Working data type - * @param [out] p Pointer to populate with the created FFT plan. - * @param [in] n The overall size of the FFT to perform. - * @param [in] dir The direction of the FFT (forwards or backwards). + * @tparam Tx Input data type + * @tparam Ty Output data type + * @tparam Tw Working data type + * @param [out] p Pointer to populate with the created FFT plan. + * @param [in] n The overall size of the FFT to perform. + * @param [in] dir The direction of the FFT (forwards or backwards). + * @param [in] allow_raders Allow use of Rader's algorithm. 
* @returns ARMRAL_SUCCESS if a plan is successfully created. */ template armral_status create_plan(armral_fft_plan_t **p, int n, - armral_fft_direction_t dir); + armral_fft_direction_t dir, bool allow_raders); /** * Common code for destroying a plan. For the time being, the plan is identical diff --git a/src/LowerPHY/FFT/rader.cpp b/src/LowerPHY/FFT/rader.cpp index 70dd18fe1e65f80eb05b1c4587be87c45df03c25..efa38bcc2d49846195df9b11c592f095374ad527 100644 --- a/src/LowerPHY/FFT/rader.cpp +++ b/src/LowerPHY/FFT/rader.cpp @@ -7,9 +7,7 @@ #include "fft_execute.hpp" #include "rader_generator.hpp" -#include #include -#include #ifndef M_PI #define M_PI 3.14159265358979323846 @@ -18,12 +16,12 @@ namespace armral::fft { template -rader make_rader(int n, armral_fft_direction_t dir) { +std::optional> make_rader(int n, armral_fft_direction_t dir) { using real_t = armral::fft::real_t; auto g = find_group_generator(n); if (!g) { - return {}; + return std::nullopt; } auto g_inv = find_inverse_mod_n(g, n); @@ -32,24 +30,24 @@ rader make_rader(int n, armral_fft_direction_t dir) { armral_fft_plan_t *pf = nullptr; armral_fft_plan_t *pb = nullptr; armral::fft::create_plan( - &pf, n - 1, armral_fft_direction_t::ARMRAL_FFT_FORWARDS); + &pf, n - 1, armral_fft_direction_t::ARMRAL_FFT_FORWARDS, false); armral::fft::create_plan( - &pb, n - 1, armral_fft_direction_t::ARMRAL_FFT_BACKWARDS); + &pb, n - 1, armral_fft_direction_t::ARMRAL_FFT_BACKWARDS, false); if (!pf || !pb) { if (pf) { armral::fft::destroy_plan(&pf); } else if (pb) { armral::fft::destroy_plan(&pb); } - return {}; + return std::nullopt; } // fill out permutation vectors to avoid needing to do expensive // modulus operations in the actual execute call. 
- int *gmul_fw_perm = (int *)malloc((n - 1) * sizeof(int)); - int *gmul_bw_perm = (int *)malloc((n - 1) * sizeof(int)); - int *ginvmul_fw_perm = (int *)malloc((n - 1) * sizeof(int)); - int *ginvmul_bw_perm = (int *)malloc((n - 1) * sizeof(int)); + int *gmul_fw_perm = static_cast(malloc((n - 1) * sizeof(int))); + int *gmul_bw_perm = static_cast(malloc((n - 1) * sizeof(int))); + int *ginvmul_fw_perm = static_cast(malloc((n - 1) * sizeof(int))); + int *ginvmul_bw_perm = static_cast(malloc((n - 1) * sizeof(int))); for (int j = 1, gmul = 1, ginvmul = 1; j < n; ++j) { gmul_fw_perm[j - 1] = gmul; ginvmul_fw_perm[j - 1] = ginvmul; @@ -60,7 +58,7 @@ rader make_rader(int n, armral_fft_direction_t dir) { } // Populate the vector b to be used in the convolution - Tw *b = (Tw *)malloc((n - 1) * sizeof(Tw)); + Tw *b = static_cast(malloc((n - 1) * sizeof(Tw))); double dir_float = dir == armral_fft_direction_t::ARMRAL_FFT_FORWARDS ? -1.0 : 1.0; for (int i = 0; i < n - 1; i++) { @@ -89,13 +87,17 @@ rader make_rader(int n, armral_fft_direction_t dir) { ginvmul_bw_perm}; } -template rader +template std::optional< + rader> make_rader(int n, armral_fft_direction_t dir); -template rader +template std::optional< + rader> make_rader(int n, armral_fft_direction_t dir); -template rader +template std::optional< + rader> make_rader(int n, armral_fft_direction_t dir); -template rader +template std::optional< + rader> make_rader(int n, armral_fft_direction_t dir); template struct rader &r, const Tx *x, Ty *y, int istride, // TODO: Use a statically allocated array rather than calling // malloc every time auto nm1 = r.n - 1; - auto *work_ptr = (Tw *)malloc(howmany * nm1 * sizeof(Tw)); - auto *x0 = (Tw *)malloc(howmany * sizeof(Tw)); - auto *y0 = (Tw *)malloc(howmany * sizeof(Tw)); + auto *work_ptr = static_cast(malloc(howmany * nm1 * sizeof(Tw))); + auto *x0 = static_cast(malloc(howmany * sizeof(Tw))); + auto *y0 = static_cast(malloc(howmany * sizeof(Tw))); rader_init(r, x, x0, y0, istride, w, 
howmany, idist, work_ptr); diff --git a/src/LowerPHY/FFT/rader.hpp b/src/LowerPHY/FFT/rader.hpp index bbe53a128b94062775850396790430acadcbe0f8..89b006c0a9a6049bffc61d4a1b4844f340ad6a03 100644 --- a/src/LowerPHY/FFT/rader.hpp +++ b/src/LowerPHY/FFT/rader.hpp @@ -5,9 +5,8 @@ #pragma once #include "fft_plan.hpp" -#include "fft_types.hpp" -#include +#include #ifdef ARMRAL_SEMIHOSTING #define M_PI 3.14159265358979323846 @@ -85,7 +84,7 @@ struct rader { }; template -rader make_rader(int n, armral_fft_direction_t dir); +std::optional> make_rader(int n, armral_fft_direction_t dir); template void execute_rader(const rader &r, const Tx *x, Ty *y, int istride, diff --git a/src/MatrixFactorizations/SVD/arm_svd.cpp b/src/MatrixFactorizations/SVD/arm_svd.cpp index e155b69f0fd2e5403efcaf9d046c3b2c91bf8d1c..d7086cbd67bc7bd204ad345144f70a5e929fbc87 100644 --- a/src/MatrixFactorizations/SVD/arm_svd.cpp +++ b/src/MatrixFactorizations/SVD/arm_svd.cpp @@ -201,12 +201,6 @@ inline void cmplx_axmy_f32(uint32_t n, const armral_cmplx_f32_t *p_src_a, // Store real and imaginary result in destination buffer. p_src_c->re = real_sum; p_src_c->im = imag_sum; - - p_src_a++; - p_src_c++; - - // Decrement loop counter - blk_cnt--; } } diff --git a/src/MatrixFactorizations/SVD/matrix_view.hpp b/src/MatrixFactorizations/SVD/matrix_view.hpp index 49eaf7d81ae5396a64c54374a3751e17f5c91875..36c7f638fc27d184c2d4043c579ef211267fee4e 100644 --- a/src/MatrixFactorizations/SVD/matrix_view.hpp +++ b/src/MatrixFactorizations/SVD/matrix_view.hpp @@ -8,7 +8,8 @@ #include /* - A non-owning column major view of a matrix to provide more convenient indexing. + A non-owning column major view of a matrix to provide more convenient + indexing. 
*/ template struct column_major_matrix_view { diff --git a/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_decoder.cpp b/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_decoder.cpp index bc5d238d42be71e12517daece888fd0bc2e06334..76476b45e66bee88ff5e928b9fc71d3b1b9204a3 100644 --- a/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_decoder.cpp +++ b/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_decoder.cpp @@ -42,8 +42,8 @@ void compute_path(uint8_t *dec, uint32_t k, uint8_t states, uint8_t const *prev, int cmp(const void *a, const void *b) { int ret; - const pm_s ia = *(const pm_s *)a; - const pm_s ib = *(const pm_s *)b; + const pm_s ia = *static_cast(a); + const pm_s ib = *static_cast(b); if (ia.pm < ib.pm) { ret = -1; diff --git a/src/UpperPHY/LDPC/ldpc_encoder.cpp b/src/UpperPHY/LDPC/ldpc_encoder.cpp index 951a1678718d62a52feb97628b0a4fee194f63a0..5e97e351eb1ea777b0679613d65e006d6cafcae0 100644 --- a/src/UpperPHY/LDPC/ldpc_encoder.cpp +++ b/src/UpperPHY/LDPC/ldpc_encoder.cpp @@ -1204,10 +1204,6 @@ inline void set_parity_hdsm_bg1_lsi_6(uint32_t z, const uint8_t *parity_hdsm, svst1_u8(pg_tail, data_out + 23 * z, result23); svst1_u8(pg_tail, data_out + 24 * z, result24); svst1_u8(pg_tail, data_out + 25 * z, result25); - // Increment pointers - ptr_agg += tail_size; - ptr_hdsm += tail_size; - data_out += tail_size; } } else { // z != 208 @@ -1269,10 +1265,6 @@ inline void set_parity_hdsm_bg1_lsi_6(uint32_t z, const uint8_t *parity_hdsm, svst1_u8(pg_tail, data_out + 23 * z, result23); svst1_u8(pg_tail, data_out + 24 * z, result24); svst1_u8(pg_tail, data_out + 25 * z, result25); - // Increment pointers - ptr_agg += tail_size; - ptr_hdsm += tail_size; - data_out += tail_size; } } #else @@ -2402,7 +2394,8 @@ armral_status encode_block(const uint8_t *data_in, armral_ldpc_graph_t bg, return ARMRAL_SUCCESS; } - // Cast the bits to bytes for easier handling of data, ignore filler bits if present + // Cast the bits to bytes for easier handling of data, + // 
ignore filler bits if present bits_to_bytes(z * graph->nmessage_bits - len_filler_bits, data_in, bytes_in.get()); diff --git a/src/UpperPHY/Modulation/arm_modulation.c b/src/UpperPHY/Modulation/arm_modulation.c index 242acb4e9824ae0a456d87ea0432a8240326bb61..3049942e60e708268eebc0ef2b12891aa64ea8fc 100644 --- a/src/UpperPHY/Modulation/arm_modulation.c +++ b/src/UpperPHY/Modulation/arm_modulation.c @@ -7,7 +7,7 @@ #include #ifdef ARMRAL_ARCH_SVE -#include "arm_sve.h" +#include #endif /* Definition of the constellation map according to 3GPP specification. @@ -668,7 +668,6 @@ void armral_64qam_modulation(const uint32_t nbits, const uint8_t *p_src, point = constellation_64qam[index2]; *p_dst = point; - p_dst++; } } } @@ -804,7 +803,6 @@ void armral_256qam_modulation(const uint32_t nbits, const uint8_t *p_src, svint32_t gather = svld1_gather_index(pred, (const int32_t *)constellation_256qam, index); svst1_s32(pred, (int32_t *)p_dst, gather); - p_dst += leftover_bytes; } #else /* Compute the blocks which will be processed using loop unroll */ diff --git a/src/UpperPHY/Polar/arm_polar_frozen_bits.cpp b/src/UpperPHY/Polar/arm_polar_frozen_bits.cpp index f670c9a116d620f9dcef9573bd9692de0ed5b073..5b3767fd5ced4af837d5637f4e71f23e937ad7a5 100644 --- a/src/UpperPHY/Polar/arm_polar_frozen_bits.cpp +++ b/src/UpperPHY/Polar/arm_polar_frozen_bits.cpp @@ -1111,7 +1111,7 @@ polar_frozen_mask_impl_puncturing(uint32_t e, uint32_t k, uint8_t *frozen) { // puncturing auto qf_tmp = [&](uint32_t q0ni, uint32_t i) -> bool { - return (e <= arrs->qb[i]) | (q0ni < limit); + return (e <= arrs->qb[i]) || (q0ni < limit); }; // most reliable indices are at the start of Q0_n, so find the first diff --git a/src/UpperPHY/Turbo/arm_turbo_decoder.cpp b/src/UpperPHY/Turbo/arm_turbo_decoder.cpp index e9b89ec43972554236b4266acb115887cc7c8be8..cc380fd355add6eeb11495ce0c01c344bd424c9a 100644 --- a/src/UpperPHY/Turbo/arm_turbo_decoder.cpp +++ b/src/UpperPHY/Turbo/arm_turbo_decoder.cpp @@ -12,17 +12,24 @@ 
template void armral::turbo::decode_block( const int8_t *sys, const int8_t *par, const int8_t *itl, uint32_t k, - uint8_t *dst, float32_t l_c, uint32_t max_iter, heap_allocator &); + uint8_t *dst, float32_t l_c, uint32_t max_iter, uint16_t *perm_idxs, + heap_allocator &); template void armral::turbo::decode_block( const int8_t *sys, const int8_t *par, const int8_t *itl, uint32_t k, - uint8_t *dst, float32_t l_c, uint32_t max_iter, buffer_bump_allocator &); + uint8_t *dst, float32_t l_c, uint32_t max_iter, uint16_t *perm_idxs, + buffer_bump_allocator &); + +armral_status armral_turbo_perm_idx_init(uint16_t *buffer) { + return armral::turbo::all_perm_idx_init( + (armral::turbo::perm_idx_lookup *)buffer); +} template -static armral_status turbo_decode_block(const int8_t *sys, const int8_t *par, - const int8_t *itl, uint32_t k, - uint8_t *dst, uint32_t max_iter, - Allocator &allocator) { +static armral_status +turbo_decode_block(const int8_t *sys, const int8_t *par, const int8_t *itl, + uint32_t k, uint8_t *dst, uint32_t max_iter, + uint16_t *perm_idxs, Allocator &allocator) { if (!armral::turbo::valid_num_bits(k)) { return ARMRAL_ARGUMENT_ERROR; } @@ -33,30 +40,31 @@ static armral_status turbo_decode_block(const int8_t *sys, const int8_t *par, // N. Wehn, "Turbo-decoding without SNR estimation", IEEE Communications // Letters 4(6), pp. 193-195, July 2000. 
armral::turbo::decode_block(sys, par, itl, k, dst, 2.F, max_iter, - allocator); + perm_idxs, allocator); return ARMRAL_SUCCESS; } armral_status armral_turbo_decode_block(const int8_t *sys, const int8_t *par, const int8_t *itl, uint32_t k, - uint8_t *dst, uint32_t max_iter) { + uint8_t *dst, uint32_t max_iter, + uint16_t *perm_idxs) { heap_allocator allocator{}; - return turbo_decode_block(sys, par, itl, k, dst, max_iter, allocator); + return turbo_decode_block(sys, par, itl, k, dst, max_iter, perm_idxs, + allocator); } -armral_status armral_turbo_decode_block_noalloc(const int8_t *sys, - const int8_t *par, - const int8_t *itl, uint32_t k, - uint8_t *dst, uint32_t max_iter, - void *buffer) { +armral_status armral_turbo_decode_block_noalloc( + const int8_t *sys, const int8_t *par, const int8_t *itl, uint32_t k, + uint8_t *dst, uint32_t max_iter, uint16_t *perm_idxs, void *buffer) { buffer_bump_allocator allocator{buffer}; - return turbo_decode_block(sys, par, itl, k, dst, max_iter, allocator); + return turbo_decode_block(sys, par, itl, k, dst, max_iter, perm_idxs, + allocator); } uint32_t armral_turbo_decode_block_noalloc_buffer_size(uint32_t k, uint32_t max_iter) { counting_allocator allocator{}; (void)turbo_decode_block(nullptr, nullptr, nullptr, k, nullptr, max_iter, - allocator); + nullptr, allocator); return allocator.required_bytes(); } diff --git a/src/UpperPHY/Turbo/arm_turbo_decoder.hpp b/src/UpperPHY/Turbo/arm_turbo_decoder.hpp index dcd60a8b3afab8971d5887e29e75f129a6d97deb..f2e3012b2f606540b5925e091c9a402ec9829aa1 100644 --- a/src/UpperPHY/Turbo/arm_turbo_decoder.hpp +++ b/src/UpperPHY/Turbo/arm_turbo_decoder.hpp @@ -293,7 +293,7 @@ template void armral::turbo::decode_block(const int8_t *sys, const int8_t *par, const int8_t *itl, uint32_t k, uint8_t *dst, float32_t l_c, uint32_t max_iter, - Allocator &allocator) { + uint16_t *perm_idxs, Allocator &allocator) { // This implements multiple steps of the max-log-MAP algorithm, // which is an approximation to 
the MAP (BCJR) algorithm. It returns // a hard decision rather than raw LLRs. @@ -307,16 +307,8 @@ void armral::turbo::decode_block(const int8_t *sys, const int8_t *par, auto par_s16 = allocate_uninitialized(allocator, k8 + 1); auto itl_s16 = allocate_uninitialized(allocator, k8 + 1); - auto perm_idx = allocate_uninitialized(allocator, k); auto perm_sys = allocate_uninitialized(allocator, k8 + 1); - struct perm_pair { - uint16_t first; - uint16_t second; - }; - - auto perm_lookup = allocate_uninitialized(allocator, k); - // Allocate space to hold the extrinsic and permuted extrinsic information // to be passed between the two decoders. Extrinsic is initially set to 0. auto extrinsic = allocate_zeroed(allocator, k8); @@ -337,9 +329,33 @@ void armral::turbo::decode_block(const int8_t *sys, const int8_t *par, // k+3 bits to decode auto gamma = allocate_uninitialized(allocator, k8); - // NOTE: All allocations done. - if constexpr (Allocator::is_counting) { - return; + // Get the permutation vector for the input value of k. + // declare unique_ptr here to keep the allocated memory's scope outside the + // else block + unique_ptr perm_lookup_unique; + perm_idx_lookup *perm_lookup = nullptr; + // Find the index into the array of parameter arrays corresponding + // to the current k. Subtract 40 because k=40 is the lowest value. + uint32_t param_idx = armral::turbo::perm_params_lookup[(k - 40) >> 3]; + if (perm_idxs != NULL) { + // NOTE: All allocations done. + if constexpr (Allocator::is_counting) { + return; + } + perm_lookup = (perm_idx_lookup *)perm_idxs + + armral::turbo::perm_lookup_offset[param_idx]; + } else { + perm_lookup_unique = allocate_uninitialized(allocator, k); + + // NOTE: All allocations done. + if constexpr (Allocator::is_counting) { + return; + } + + perm_lookup = perm_lookup_unique.get(); + + // Generate the permutation vector for the input value of k. 
+ armral::turbo::k_perm_idx_init(k, param_idx, perm_lookup); } // Convert our LLRs from int8_ts into int16_ts @@ -371,38 +387,17 @@ void armral::turbo::decode_block(const int8_t *sys, const int8_t *par, // Prescale l_c to avoid doing it repeatedly in the PDF calculations later const int16x8_t channel_reliability = vdupq_n_s16((int16_t)l_c / 2); - // Generate the permutation vector for the input value of k - // Find the index into the array of parameter arrays corresponding - // to the current k. Subtract 40 because k=40 is the lowest value. - int param_idx = armral::turbo::perm_params_lookup[(k - 40) >> 3]; - // and extract the correct values of f1 and f2 to build the - // interleaving polynomial - uint16_t f1 = armral::turbo::perm_params[param_idx][0]; - uint16_t f2 = armral::turbo::perm_params[param_idx][1]; - for (uint32_t i = 0; i < k; i++) { - perm_idx[i] = generate_perm_idx(i, f1, f2, k); - } - // Create a permuted version of the systematic output for use // with the second decoder for (uint32_t i = 0; i < k8; i++) { for (uint32_t j = 0; j < 8; j++) { - perm_sys[i][j] = (int16_t)sys[perm_idx[(i * 8) + j]]; + perm_sys[i][j] = (int16_t)sys[perm_lookup[(i * 8) + j].perm_idx]; } } perm_sys[k8][0] = (int16_t)sys[k + 2]; perm_sys[k8][1] = (int16_t)itl[k + 2]; perm_sys[k8][2] = (int16_t)par[k + 3]; - // Create a look-up of the permutation vector that maps [0,...k-1] indices - // to vector element/vector lane pairs. This avoids having to a modulo - // operator every time we want to apply the permutation to vector elements. 
- for (uint32_t i = 0; i < k; i++) { - uint16_t vec_idx = perm_idx[i] / 8; - uint16_t vec_lane = perm_idx[i] % 8; - perm_lookup[i] = perm_pair{vec_idx, vec_lane}; - } - // Initialize alpha alpha[0] = vdupq_n_s16(std::numeric_limits::min()); alpha[0][0] = 0; @@ -428,8 +423,8 @@ void armral::turbo::decode_block(const int8_t *sys, const int8_t *par, // Need to unpermute extrinsic to match input to second decoder for (uint32_t i = 0; i < k8; i++) { for (uint32_t j = 0; j < 8; j++) { - perm_extrinsic[i][j] = extrinsic[perm_lookup[i * 8 + j].first] - [perm_lookup[i * 8 + j].second]; + perm_extrinsic[i][j] = extrinsic[perm_lookup[i * 8 + j].vec_idx] + [perm_lookup[i * 8 + j].vec_lane]; } } @@ -444,8 +439,8 @@ void armral::turbo::decode_block(const int8_t *sys, const int8_t *par, // But need to unpermute extrinsic first for (uint32_t i = 0; i < k8; i++) { for (uint32_t j = 0; j < 8; j++) { - extrinsic[perm_lookup[i * 8 + j].first][perm_lookup[i * 8 + j].second] = - perm_extrinsic[i][j]; + extrinsic[perm_lookup[i * 8 + j].vec_idx] + [perm_lookup[i * 8 + j].vec_lane] = perm_extrinsic[i][j]; } } @@ -479,7 +474,7 @@ void armral::turbo::decode_block(const int8_t *sys, const int8_t *par, // Rather than allocate another new vector, copy into l1_uky and return that for (uint32_t i = 0; i < k8; i++) { for (uint32_t j = 0; j < 8; j++) { - l1_uky[perm_lookup[i * 8 + j].first][perm_lookup[i * 8 + j].second] = + l1_uky[perm_lookup[i * 8 + j].vec_idx][perm_lookup[i * 8 + j].vec_lane] = l2_uky[i][j]; } } diff --git a/src/UpperPHY/Turbo/arm_turbo_rate_matching.cpp b/src/UpperPHY/Turbo/arm_turbo_rate_matching.cpp index 25b3ede9dfb69e19d2c6b0295820cc84f37f397c..beac775f134a2d1487cac429a48a4581080f3ddb 100644 --- a/src/UpperPHY/Turbo/arm_turbo_rate_matching.cpp +++ b/src/UpperPHY/Turbo/arm_turbo_rate_matching.cpp @@ -95,7 +95,7 @@ static void subblock_interleave(uint32_t d, uint32_t kw, const uint8_t *d0, } // Permuted first row of dummy bit matrix is known. 
So, we can use - // a look up table. + // a look up table constexpr uint8_t dummy_table[32][4] = { {0x00, 0x00, 0x00, 0x00}, {0x80, 0x00, 0x00, 0x00}, {0x80, 0x00, 0x80, 0x00}, {0x80, 0x80, 0x80, 0x00}, @@ -181,6 +181,10 @@ static void bit_collection(uint32_t kpi, const uint8_t *v0, const uint8_t *v1, static void bit_selection(uint32_t kw, uint32_t ncb, uint32_t k0, uint32_t e, const uint8_t *w, const uint8_t *dummy, uint8_t *out) { + // This condition is implied in rate_matching() when this function is called + // so check that it is actually true on entry + assert(ncb == kw); + memset((void *)out, 0, sizeof(uint8_t) * (e + 7) / 8); uint32_t k = 0; uint32_t j = 0; @@ -209,7 +213,7 @@ armral_status rate_matching(uint32_t d, uint32_t e, uint32_t rv, assert(e > 0); assert(rv <= 3); - // The minimum number of rows which gives rtc * ctc >= d. + // The minimum number of rows which gives rtc * ctc >= d const uint32_t rtc = (d + armral::turbo::ctc - 1) / armral::turbo::ctc; const uint32_t kpi = rtc * armral::turbo::ctc; const uint32_t kw = 3 * kpi; @@ -232,12 +236,11 @@ armral_status rate_matching(uint32_t d, uint32_t e, uint32_t rv, return ARMRAL_SUCCESS; } - // Assume N_cb = k_w. + // Assume N_cb = k_w const uint32_t ncb = kw; - // Calculate k0 with the assumption N_cb = k_w. 
+ // Calculate k0 with the assumption N_cb = k_w // k0 = rtc * (2 * N_cb/(8 * rtc) * rv + 2), with N_cb = kw = 3 * ctc * rtc - assert(ncb == kw); const uint32_t k0 = rtc * (24 * rv + 2); subblock_interleave(d, kw, src0, src1, src2, v0.get(), v1.get(), v2.get(), diff --git a/src/UpperPHY/Turbo/turbo_code.hpp b/src/UpperPHY/Turbo/turbo_code.hpp index 2dd3ff24369f3e82fdb8543b3360755400a75156..6d56f1d51029753c908f6ab25d4d959abb78528a 100644 --- a/src/UpperPHY/Turbo/turbo_code.hpp +++ b/src/UpperPHY/Turbo/turbo_code.hpp @@ -5,6 +5,7 @@ #pragma once #include "armral.h" +#include "turbo_tables.hpp" namespace armral::turbo { @@ -37,6 +38,51 @@ inline uint16_t generate_perm_idx(uint32_t i, uint16_t f1, uint16_t f2, return static_cast((uint64_t(f1) * i + uint64_t(f2) * i * i) % k); } +struct perm_idx_lookup { + uint16_t perm_idx; + uint16_t vec_idx; + uint16_t vec_lane; +}; + +inline void k_perm_idx_init(uint16_t k, uint16_t k_idx, + perm_idx_lookup *perm_idxs) { + // Extract the correct values of f1 and f2 to build the + // interleaving polynomial + uint16_t f1 = armral::turbo::perm_params[k_idx][0]; + uint16_t f2 = armral::turbo::perm_params[k_idx][1]; + // Generate the permutation vector for the input value of k. 
+ for (uint16_t i = 0; i < k; ++i) { + uint16_t perm_idx = generate_perm_idx(i, f1, f2, k); + perm_idxs[i].perm_idx = perm_idx; + perm_idxs[i].vec_idx = perm_idx / 8; + perm_idxs[i].vec_lane = perm_idx % 8; + } +} + +inline armral_status all_perm_idx_init(perm_idx_lookup *buffer) { + + uint16_t k = 40; + uint16_t k_idx = 0; + for (; k < 512; k += 8, ++k_idx) { + k_perm_idx_init(k, k_idx, + buffer + armral::turbo::perm_lookup_offset[k_idx]); + } + for (; k < 1024; k += 16, ++k_idx) { + k_perm_idx_init(k, k_idx, + buffer + armral::turbo::perm_lookup_offset[k_idx]); + } + for (; k < 2048; k += 32, ++k_idx) { + k_perm_idx_init(k, k_idx, + buffer + armral::turbo::perm_lookup_offset[k_idx]); + } + for (; k <= 6144; k += 64, ++k_idx) { + k_perm_idx_init(k, k_idx, + buffer + armral::turbo::perm_lookup_offset[k_idx]); + } + + return ARMRAL_SUCCESS; +} + // An "expert" interface for Turbo decoding a single block. It allows the user // to specify a channel reliability measure L_c, which should be computed as: // @@ -53,6 +99,6 @@ inline uint16_t generate_perm_idx(uint32_t i, uint16_t f1, uint16_t f2, template void decode_block(const int8_t *sys, const int8_t *par, const int8_t *itl, uint32_t k, uint8_t *dst, float32_t l_c, uint32_t max_iter, - Allocator &allocator); + uint16_t *perm_idxs, Allocator &allocator); } // namespace armral::turbo diff --git a/src/UpperPHY/Turbo/turbo_tables.hpp b/src/UpperPHY/Turbo/turbo_tables.hpp index f2de1e22460f20daabc061038f5060c9ec89bb0b..6fb5d8d94b7750193a7950242a629b6375e3f1fc 100644 --- a/src/UpperPHY/Turbo/turbo_tables.hpp +++ b/src/UpperPHY/Turbo/turbo_tables.hpp @@ -147,6 +147,32 @@ static constexpr int16_t perm_params_lookup[764] = { -1, -1, -1, 184, -1, -1, -1, -1, -1, -1, -1, 185, -1, -1, -1, -1, -1, -1, -1, 186, -1, -1, -1, -1, -1, -1, -1, 187}; +// A table to get the offset to a specific K's permutation indices from the all +// k indices array created by armral_turbo_perm_idx_init(). 
Usage: +// perm_idx_lookup *k_perm_idxs = all_k_perm_idxs + perm_lookup_offset[perm_params_lookup[k/8-5]] +static constexpr uint32_t perm_lookup_offset[188] = { + 0, 40, 88, 144, 208, 280, 360, 448, 544, + 648, 760, 880, 1008, 1144, 1288, 1440, 1600, 1768, + 1944, 2128, 2320, 2520, 2728, 2944, 3168, 3400, 3640, + 3888, 4144, 4408, 4680, 4960, 5248, 5544, 5848, 6160, + 6480, 6808, 7144, 7488, 7840, 8200, 8568, 8944, 9328, + 9720, 10120, 10528, 10944, 11368, 11800, 12240, 12688, 13144, + 13608, 14080, 14560, 15048, 15544, 16048, 16560, 17088, 17632, + 18192, 18768, 19360, 19968, 20592, 21232, 21888, 22560, 23248, + 23952, 24672, 25408, 26160, 26928, 27712, 28512, 29328, 30160, + 31008, 31872, 32752, 33648, 34560, 35488, 36432, 37392, 38368, + 39360, 40368, 41392, 42448, 43536, 44656, 45808, 46992, 48208, + 49456, 50736, 52048, 53392, 54768, 56176, 57616, 59088, 60592, + 62128, 63696, 65296, 66928, 68592, 70288, 72016, 73776, 75568, + 77392, 79248, 81136, 83056, 85008, 86992, 89008, 91056, 93168, + 95344, 97584, 99888, 102256, 104688, 107184, 109744, 112368, 115056, + 117808, 120624, 123504, 126448, 129456, 132528, 135664, 138864, 142128, + 145456, 148848, 152304, 155824, 159408, 163056, 166768, 170544, 174384, + 178288, 182256, 186288, 190384, 194544, 198768, 203056, 207408, 211824, + 216304, 220848, 225456, 230128, 234864, 239664, 244528, 249456, 254448, + 259504, 264624, 269808, 275056, 280368, 285744, 291184, 296688, 302256, + 307888, 313584, 319344, 325168, 331056, 337008, 343024, 349104}; + // A table to find the output byte given an input byte static constexpr uint8_t encoded_bytes[2048] = { 0, 114, 203, 185, 229, 151, 46, 92, 1, 115, 202, 184, 228, 150, 47, diff --git a/src/utils/allocators.hpp b/src/utils/allocators.hpp index 1876dd20285c9386398c6e17aa777312f62f92ce..9bfa826b83b41c47e060b6a888fb6e38c6a694e4 100644 --- a/src/utils/allocators.hpp +++ b/src/utils/allocators.hpp @@ -87,7 +87,7 @@ static inline void *align_ptr(void *ptr, uint32_t align) { class 
buffer_bump_allocator : public base_allocator { public: - buffer_bump_allocator(void *buffer) : m_next_ptr(buffer) {} + explicit buffer_bump_allocator(void *buffer) : m_next_ptr(buffer) {} template T *allocate_uninitialized(size_t nitems) { diff --git a/test/BasicMathFun/MatrixMult/Batch/ArmSolve/main.cpp b/test/BasicMathFun/MatrixMult/Batch/ArmSolve/main.cpp index f7447aac8dda0f28bdbada97e4ce8f6f0f75a416..0a17a6299273a9e660e5725ae8bf92a070f13238 100644 --- a/test/BasicMathFun/MatrixMult/Batch/ArmSolve/main.cpp +++ b/test/BasicMathFun/MatrixMult/Batch/ArmSolve/main.cpp @@ -75,8 +75,8 @@ static armral_cmplx_int16_t convert_cs16_cf64(std::complex x, armral_fixed_point_index i) { int sh = (int)i; // number of decimal bits x *= (1 << sh); - armral::utils::qint64_t re = (int64_t)x.real(); - armral::utils::qint64_t im = (int64_t)x.imag(); + armral::utils::qint64_t re{(int64_t)x.real()}; + armral::utils::qint64_t im{(int64_t)x.imag()}; return {re.get16(), im.get16()}; } diff --git a/test/BasicMathFun/MatrixMult/Single/MatrixMult16/main.cpp b/test/BasicMathFun/MatrixMult/Single/MatrixMult16/main.cpp index 4dfadcd4a837d3da2154a6ed54ecf337846214ef..90a4e482a744c32500d2d47a647a675dbbabcb31 100644 --- a/test/BasicMathFun/MatrixMult/Single/MatrixMult16/main.cpp +++ b/test/BasicMathFun/MatrixMult/Single/MatrixMult16/main.cpp @@ -3,9 +3,13 @@ SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "reference_linalg.hpp" +#include -static bool run_general_matmul_test_64(uint16_t m, uint16_t n, uint16_t k) { - const char *name = "MATMUL64 armral_cmplx_int16_t"; +namespace { +template +bool run_general_matmul_test_64( + char const *name, uint16_t m, uint16_t n, uint16_t k, + CmplxMatmulMatchingFunction cmplx_matmul_matching_under_test) { armral::utils::cs16_random random; const auto a = random.vector(m * k); const auto b = random.vector(k * n); @@ -16,12 +20,14 @@ static bool run_general_matmul_test_64(uint16_t m, uint16_t n, uint16_t k) { 
armral::utils::reference_matmul_cs16(m, n, k, a.data(), b.data(), ref.data(), 0); - armral_cmplx_matmul_i16(m, n, k, a.data(), b.data(), c.data()); + cmplx_matmul_matching_under_test(m, n, k, a.data(), b.data(), c.data()); return armral::utils::check_results_cs16(name, c.data(), ref.data(), m * n); } -static bool run_general_matmul_test_32(uint16_t m, uint16_t n, uint16_t k) { - const char *name = "MATMUL32 armral_cmplx_int16_t"; +template +bool run_general_matmul_test_32( + char const *name, uint16_t m, uint16_t n, uint16_t k, + CmplxMatmulMatchingFunction cmplx_matmul_matching_under_test) { // choose min/max values to avoid hitting saturation on the problems // we care about (m,n,k <= 16). constexpr armral_cmplx_int16_t min = {-4096, -4096}; @@ -36,26 +42,58 @@ static bool run_general_matmul_test_32(uint16_t m, uint16_t n, uint16_t k) { armral::utils::reference_matmul_cs16(m, n, k, a.data(), b.data(), ref.data(), 0); - armral_cmplx_matmul_i16_32bit(m, n, k, a.data(), b.data(), c.data()); + cmplx_matmul_matching_under_test(m, n, k, a.data(), b.data(), c.data()); return armral::utils::check_results_cs16(name, c.data(), ref.data(), m * n); } -// Entry point for unit testing for 16-bit matrix multiplication -int main(int argc, char **argv) { +template +bool run_all_tests_64bit( + char const *name, + CmplxMatmulMatchingFunction cmplx_matmul_matching_under_test) { bool passed = true; - for (unsigned m = 1; m <= 16; ++m) { - for (unsigned n = 1; n <= 16; ++n) { - for (unsigned k = 1; k <= 16; ++k) { - passed &= run_general_matmul_test_64(m, n, k); + for (uint16_t m = 1; m <= 16; ++m) { + for (uint16_t n = 1; n <= 16; ++n) { + for (uint16_t k = 1; k <= 16; ++k) { + passed &= run_general_matmul_test_64(name, m, n, k, + cmplx_matmul_matching_under_test); } } } - for (unsigned m = 1; m <= 16; ++m) { - for (unsigned n = 1; n <= 16; ++n) { - for (unsigned k = 1; k <= 16; ++k) { - passed &= run_general_matmul_test_32(m, n, k); + return passed; +} + +template +bool 
run_all_tests_32bit( + char const *name, + CmplxMatmulMatchingFunction cmplx_matmul_matching_under_test) { + bool passed = true; + for (uint16_t m = 1; m <= 16; ++m) { + for (uint16_t n = 1; n <= 16; ++n) { + for (uint16_t k = 1; k <= 16; ++k) { + passed &= run_general_matmul_test_32(name, m, n, k, + cmplx_matmul_matching_under_test); } } } + return passed; +} +} // anonymous namespace + +// Entry point for unit testing for 16-bit matrix multiplication +int main(int argc, char **argv) { + bool passed = true; + + passed &= run_all_tests_64bit("MATMUL64 armral_cmplx_int16_t", + armral_cmplx_matmul_i16); + passed &= run_all_tests_64bit( + "MATMUL64 armral_cmplx_int16_t NoAlloc", + [](uint16_t m, uint16_t n, uint16_t k, auto... args) { + std::vector buffer(k * n * sizeof(armral_cmplx_int16_t)); + return armral_cmplx_matmul_i16_noalloc(m, n, k, args..., buffer.data()); + }); + + passed &= run_all_tests_32bit("MATMUL32 armral_cmplx_int16_t", + armral_cmplx_matmul_i16_32bit); + exit(passed ? EXIT_SUCCESS : EXIT_FAILURE); } diff --git a/test/LowerPHY/Correlation/main.cpp b/test/LowerPHY/Correlation/main.cpp index 1a908460c02a65f4d37fa45d1c8b977e83eae07c..4dff97e9f8d16e9045fccbed4ea7d26381dd32f3 100644 --- a/test/LowerPHY/Correlation/main.cpp +++ b/test/LowerPHY/Correlation/main.cpp @@ -43,18 +43,18 @@ static std::complex ref_sum_a_conj_b(int n, const armral_cmplx_int16_t *b) { std::complex acc; for (int i = 0; i < n; ++i) { - std::complex a_elem{a[i].re, a[i].im}; - std::complex b_elem{b[i].re, -b[i].im}; + std::complex a_elem{qint64_t{a[i].re}, qint64_t{a[i].im}}; + std::complex b_elem{qint64_t{b[i].re}, qint64_t{-b[i].im}}; acc += a_elem * b_elem; } return acc; } static qint64_t ref_sum_a_conj_a(int n, const armral_cmplx_int16_t *a) { - qint64_t acc = 0; + qint64_t acc{0}; for (int i = 0; i < n; ++i) { - std::complex a_elem{a[i].re, a[i].im}; - std::complex b_elem{a[i].re, -a[i].im}; + std::complex a_elem{qint64_t{a[i].re}, qint64_t{a[i].im}}; + std::complex 
b_elem{qint64_t{a[i].re}, qint64_t{-a[i].im}}; acc += (a_elem * b_elem).real(); } return acc; @@ -63,7 +63,7 @@ static qint64_t ref_sum_a_conj_a(int n, const armral_cmplx_int16_t *a) { static qint64_t ref_sqrt(qint64_t x) { double q33_30_to_fp = 1.0 / (32768.0 * 32768.0); double fp_to_q15 = 32768.0; - return (int64_t)(sqrt(x.get64() * q33_30_to_fp) * fp_to_q15 + 0.5); + return (qint64_t)(sqrt(x.get64() * q33_30_to_fp) * fp_to_q15 + 0.5); } static armral_cmplx_int16_t diff --git a/test/LowerPHY/FFT/FFT32/main.cpp b/test/LowerPHY/FFT/FFT32/main.cpp index e24a21c4660bdf6d3d3a7d0f1d8c4889d4dbbee9..12ab42733014843b2bd7f7a083831797855947b0 100644 --- a/test/LowerPHY/FFT/FFT32/main.cpp +++ b/test/LowerPHY/FFT/FFT32/main.cpp @@ -82,13 +82,10 @@ int main(int argc, char **argv) { constexpr int ns[] = { 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, - 32, 40, 46, 47, 62, 64, 65, 66, 68, 74, 77, 82, - 86, 99, 102, 106, 121, 134, 136, 142, 146, 169, 170, 194, - 202, 204, 206, 226, 238, 255, 272, 274, 289, 314, 326, 342, - 361, 383, 394, 440, 441, 484, 529, 542, 552, 578, 614, 626, - 706, 722, 758, 768, 800, 802, 838, 842, 880, 882, 926, 968, - 1024, 1063, 1104, 1202, 1366, 1728, 2002, 2013, 2025, 2030, 2048, 2128, - 2401, 3001, 3050, 3240, 3394}; + 32, 62, 106, 142, 202, 206, 226, 274, 314, 326, 394, 484, + 542, 573, 614, 626, 706, 758, 800, 802, 821, 838, 842, 926, + 968, 1024, 1063, 1198, 1202, 1366, 1728, 2013, 2025, 2030, 2128, 2209, + 2401, 2557, 3001, 3226, 3240, 3309, 3482, 3998, 4096}; for (int n : ns) { for (auto dir : {ARMRAL_FFT_FORWARDS, ARMRAL_FFT_BACKWARDS}) { passed &= run_fft_test(n, dir); diff --git a/test/UpperPHY/Demodulation/main.cpp b/test/UpperPHY/Demodulation/main.cpp index 642e5bd8f2d8700fc174e458e7b68a0b4a0483b4..c03563fdfa3825bb858ca88508673c1ada49d285 100644 --- a/test/UpperPHY/Demodulation/main.cpp +++ b/test/UpperPHY/Demodulation/main.cpp @@ -199,7 +199,7 @@ struct demod_test_params { /// Constructor from a 
set of parameters demod_test_params(armral_modulation_type m, unsigned bps, std::string n, std::vector ulps, std::vector ns, - demod_ref_func_t f) + const demod_ref_func_t &f) : mod_type(m), bits_per_symbol(bps), name(std::move(n)), llr_ulps(std::move(ulps)), num_symbols(std::move(ns)), ref_func(f) {} }; diff --git a/test/UpperPHY/LDPC/RateRecovery/main.cpp b/test/UpperPHY/LDPC/RateRecovery/main.cpp index 68e55f68c64895bc5a42c1dc9649d8540dba15f3..907115f24ffaffcba5ab607fd5341bd3c6307f9a 100644 --- a/test/UpperPHY/LDPC/RateRecovery/main.cpp +++ b/test/UpperPHY/LDPC/RateRecovery/main.cpp @@ -316,8 +316,8 @@ bool test_ldpc_rate_recovery( lifting_size_list, lifting_size_list + lifting_size_list_len); // Prepare nref_list for both BG1 and BG2 using lifting_size_list const uint32_t nref_list[2][3] = { - {0, (66 * lifting_size_min), (66 * lifting_size_max)}, //BG1 - {0, (50 * lifting_size_min), (50 * lifting_size_max)}, //BG2 + {0, (66 * lifting_size_min), (66 * lifting_size_max)}, // BG1 + {0, (50 * lifting_size_min), (50 * lifting_size_max)}, // BG2 }; const uint8_t rb_list[] = {1, 1, 1, 3, 4, 7, 24}; const uint32_t filler_bits_list[] = {0, 28, 36}; diff --git a/test/UpperPHY/Turbo/Decoding/main.cpp b/test/UpperPHY/Turbo/Decoding/main.cpp index af4d929785fc0465866480e0441ab96bc3d67a82..ece895d04d3f1d3bae7a5e355c4062d5fb97ffb8 100644 --- a/test/UpperPHY/Turbo/Decoding/main.cpp +++ b/test/UpperPHY/Turbo/Decoding/main.cpp @@ -15,7 +15,7 @@ // memory to the routine as the parameter test is the first thing // it does and it will return immediately when k is invalid. static bool run_turbo_decoding_parameter_test() { - return armral_turbo_decode_block(NULL, NULL, NULL, 1040, NULL, 0) == + return armral_turbo_decode_block(NULL, NULL, NULL, 1040, NULL, 0, NULL) == ARMRAL_ARGUMENT_ERROR; } @@ -23,7 +23,7 @@ static bool run_turbo_decoding_parameter_test() { // unencoded input for valid values of k. 
template static bool -run_turbo_decoding_test(char const *name, uint32_t k, +run_turbo_decoding_test(char const *name, uint32_t k, uint16_t *perm_idxs, TurboDecodeFunction turbo_decode_under_test) { auto k_bytes = k >> 3; @@ -68,7 +68,8 @@ run_turbo_decoding_test(char const *name, uint32_t k, // Decode the encoded data. We set the maximum number of decoder iterations to // 5, which in the absence of noise should always be more than enough. ret = turbo_decode_under_test(sys_demod_soft.data(), par_demod_soft.data(), - itl_demod_soft.data(), k, ans.data(), 5); + itl_demod_soft.data(), k, ans.data(), 5, + perm_idxs); bool passed = true; @@ -85,8 +86,8 @@ run_turbo_decoding_test(char const *name, uint32_t k, for (uint32_t i = 0; i < k_bytes; i++) { if (ans[i] != src[i]) { // GCOVR_EXCL_START - printf("Error! [%s_%u] result[0][%u] = 0x%x and expected[0][%u] = 0x%x\n", - name, k, i, ans[i], i, src[i]); + printf("Error! [%s_%u] result[%u] = 0x%x and expected[%u] = 0x%x\n", name, + k, i, ans[i], i, src[i]); passed = false; // GCOVR_EXCL_STOP } @@ -103,23 +104,41 @@ int main(int argc, char **argv) { // Check invalid k is detected passed &= run_turbo_decoding_parameter_test(); + // Initialize the buffer for the perm_idxs + uint32_t buff_size = 0; + for (auto k : valid_ks) { + buff_size += k; + } + buff_size *= 3; // perm_idx, vec_idx, and vec_lane + std::vector perm_idxs_buff(buff_size); + armral_turbo_perm_idx_init(perm_idxs_buff.data()); + // Check decoder decodes correctly for (auto k : valid_ks) { - passed &= - run_turbo_decoding_test("TurboDecoding", k, armral_turbo_decode_block); + passed &= run_turbo_decoding_test("TurboDecoding", k, perm_idxs_buff.data(), + armral_turbo_decode_block); + } + for (auto k : valid_ks) { + passed &= run_turbo_decoding_test("TurboDecodingNoPermIdxs", k, nullptr, + armral_turbo_decode_block); } + auto no_alloc_test = [](const int8_t *sys, const int8_t *par, + const int8_t *itl, uint32_t passed_k, uint8_t *dst, + uint32_t max_iter, uint16_t 
*perm_idxs) { + auto buffer_size = + armral_turbo_decode_block_noalloc_buffer_size(passed_k, max_iter); + std::vector buffer(buffer_size); + return armral_turbo_decode_block_noalloc( + sys, par, itl, passed_k, dst, max_iter, perm_idxs, buffer.data()); + }; + for (auto k : valid_ks) { + passed &= run_turbo_decoding_test("TurboDecodingNoAlloc", k, + perm_idxs_buff.data(), no_alloc_test); + } for (auto k : valid_ks) { - passed &= run_turbo_decoding_test( - "TurboDecodingNoAlloc", k, - [](const int8_t *sys, const int8_t *par, const int8_t *itl, - uint32_t passed_k, uint8_t *dst, uint32_t max_iter) { - auto buffer_size = - armral_turbo_decode_block_noalloc_buffer_size(passed_k, max_iter); - std::vector buffer(buffer_size); - return armral_turbo_decode_block_noalloc(sys, par, itl, passed_k, dst, - max_iter, buffer.data()); - }); + passed &= run_turbo_decoding_test("TurboDecodingNoAllocNoPermIdxs", k, + nullptr, no_alloc_test); } exit(passed ? EXIT_SUCCESS : EXIT_FAILURE); diff --git a/test/UpperPHY/Turbo/Encoding/main.cpp b/test/UpperPHY/Turbo/Encoding/main.cpp index 072218cfb4e688a3d589c4fbdc5ebed5cdd641a1..a5b109f048bb4d5d8abd02b6c5d758dbaf6dccf1 100644 --- a/test/UpperPHY/Turbo/Encoding/main.cpp +++ b/test/UpperPHY/Turbo/Encoding/main.cpp @@ -61,22 +61,25 @@ run_turbo_encoding_test(char const *name, uint32_t k, for (uint32_t i = 0; i < k_bytes + 1; i++) { if (sys[i] != sys_ref[i]) { // GCOVR_EXCL_START - printf("Error! [%s_%u] result[0][%u] = 0x%x and expected[0][%u] = 0x%x\n", - name, k, i, sys[i], i, sys_ref[i]); + printf( + "Error! [%s_%u] result_sys[%u] = 0x%x and expected_sys[%u] = 0x%x\n", + name, k, i, sys[i], i, sys_ref[i]); passed = false; // GCOVR_EXCL_STOP } if (par[i] != par_ref[i]) { // GCOVR_EXCL_START - printf("Error! [%s_%u] result[1][%u] = 0x%x and expected[1][%u] = 0x%x\n", - name, k, i, par[i], i, par_ref[i]); + printf( + "Error! 
[%s_%u] result_par[%u] = 0x%x and expected_par[%u] = 0x%x\n", + name, k, i, par[i], i, par_ref[i]); passed = false; // GCOVR_EXCL_STOP } if (itl[i] != itl_ref[i]) { // GCOVR_EXCL_START - printf("Error! [%s_%u] result[2][%u] = 0x%x and expected[2][%u] = 0x%x\n", - name, k, i, itl[i], i, itl_ref[i]); + printf( + "Error! [%s_%u] result_itl[%u] = 0x%x and expected_itl[%u] = 0x%x\n", + name, k, i, itl[i], i, itl_ref[i]); passed = false; // GCOVR_EXCL_STOP } diff --git a/test/UpperPHY/Turbo/Encoding/reference_turbo_encoder.hpp b/test/UpperPHY/Turbo/Encoding/reference_turbo_encoder.hpp index 451b6b2dd7fc73c1e9f4a86cea4cec7f45892ef5..d38c6c7b7444157eab442cd21bea4ec766178427 100644 --- a/test/UpperPHY/Turbo/Encoding/reference_turbo_encoder.hpp +++ b/test/UpperPHY/Turbo/Encoding/reference_turbo_encoder.hpp @@ -6,7 +6,6 @@ #include #include -#include #include // Precomputed encoded outputs from the RSC encoder indexed by the @@ -55,77 +54,8 @@ static constexpr uint8_t trellis_encoded_symbol[24] = { 0, 1, 0, 1, 1, 0, 1, 0, // i=0 0, 2, 0, 2, 2, 0, 2, 0, // i=1 0, 4, 0, 4, 4, 0, 4, 0}; // i=2 -// 0 1 2 3 4 5 6 7 trellis_state -// Internal interleaver parameters f1 and f2 for all valid values of K. -// These are used to compute the interleave polynomial: -// perm_idx = [ (f1*i + f2*i*i) % K for i in range(0, K)] -// The values of f1 and f2 are taken from the table in TS 36.212 -// Section 5.1.3.2.3. 
-static const std::map> perm_params = { - {40, {3, 10}}, {48, {7, 12}}, {56, {19, 42}}, - {64, {7, 16}}, {72, {7, 18}}, {80, {11, 20}}, - {88, {5, 22}}, {96, {11, 24}}, {104, {7, 26}}, - {112, {41, 84}}, {120, {103, 90}}, {128, {15, 32}}, - {136, {9, 34}}, {144, {17, 108}}, {152, {9, 38}}, - {160, {21, 120}}, {168, {101, 84}}, {176, {21, 44}}, - {184, {57, 46}}, {192, {23, 48}}, {200, {13, 50}}, - {208, {27, 52}}, {216, {11, 36}}, {224, {27, 56}}, - {232, {85, 58}}, {240, {29, 60}}, {248, {33, 62}}, - {256, {15, 32}}, {264, {17, 198}}, {272, {33, 68}}, - {280, {103, 210}}, {288, {19, 36}}, {296, {19, 74}}, - {304, {37, 76}}, {312, {19, 78}}, {320, {21, 120}}, - {328, {21, 82}}, {336, {115, 84}}, {344, {193, 86}}, - {352, {21, 44}}, {360, {133, 90}}, {368, {81, 46}}, - {376, {45, 94}}, {384, {23, 48}}, {392, {243, 98}}, - {400, {151, 40}}, {408, {155, 102}}, {416, {25, 52}}, - {424, {51, 106}}, {432, {47, 72}}, {440, {91, 110}}, - {448, {29, 168}}, {456, {29, 114}}, {464, {247, 58}}, - {472, {29, 118}}, {480, {89, 180}}, {488, {91, 122}}, - {496, {157, 62}}, {504, {55, 84}}, {512, {31, 64}}, - {528, {17, 66}}, {544, {35, 68}}, {560, {227, 420}}, - {576, {65, 96}}, {592, {19, 74}}, {608, {37, 76}}, - {624, {41, 234}}, {640, {39, 80}}, {656, {185, 82}}, - {672, {43, 252}}, {688, {21, 86}}, {704, {155, 44}}, - {720, {79, 120}}, {736, {139, 92}}, {752, {23, 94}}, - {768, {217, 48}}, {784, {25, 98}}, {800, {17, 80}}, - {816, {127, 102}}, {832, {25, 52}}, {848, {239, 106}}, - {864, {17, 48}}, {880, {137, 110}}, {896, {215, 112}}, - {912, {29, 114}}, {928, {15, 58}}, {944, {147, 118}}, - {960, {29, 60}}, {976, {59, 122}}, {992, {65, 124}}, - {1008, {55, 84}}, {1024, {31, 64}}, {1056, {17, 66}}, - {1088, {171, 204}}, {1120, {67, 140}}, {1152, {35, 72}}, - {1184, {19, 74}}, {1216, {39, 76}}, {1248, {19, 78}}, - {1280, {199, 240}}, {1312, {21, 82}}, {1344, {211, 252}}, - {1376, {21, 86}}, {1408, {43, 88}}, {1440, {149, 60}}, - {1472, {45, 92}}, {1504, {49, 846}}, {1536, {71, 
48}}, - {1568, {13, 28}}, {1600, {17, 80}}, {1632, {25, 102}}, - {1664, {183, 104}}, {1696, {55, 954}}, {1728, {127, 96}}, - {1760, {27, 110}}, {1792, {29, 112}}, {1824, {29, 114}}, - {1856, {57, 116}}, {1888, {45, 354}}, {1920, {31, 120}}, - {1952, {59, 610}}, {1984, {185, 124}}, {2016, {113, 420}}, - {2048, {31, 64}}, {2112, {17, 66}}, {2176, {171, 136}}, - {2240, {209, 420}}, {2304, {253, 216}}, {2368, {367, 444}}, - {2432, {265, 456}}, {2496, {181, 468}}, {2560, {39, 80}}, - {2624, {27, 164}}, {2688, {127, 504}}, {2752, {143, 172}}, - {2816, {43, 88}}, {2880, {29, 300}}, {2944, {45, 92}}, - {3008, {157, 188}}, {3072, {47, 96}}, {3136, {13, 28}}, - {3200, {111, 240}}, {3264, {443, 204}}, {3328, {51, 104}}, - {3392, {51, 212}}, {3456, {451, 192}}, {3520, {257, 220}}, - {3584, {57, 336}}, {3648, {313, 228}}, {3712, {271, 232}}, - {3776, {179, 236}}, {3840, {331, 120}}, {3904, {363, 244}}, - {3968, {375, 248}}, {4032, {127, 168}}, {4096, {31, 64}}, - {4160, {33, 130}}, {4224, {43, 264}}, {4288, {33, 134}}, - {4352, {477, 408}}, {4416, {35, 138}}, {4480, {233, 280}}, - {4544, {357, 142}}, {4608, {337, 480}}, {4672, {37, 146}}, - {4736, {71, 444}}, {4800, {71, 120}}, {4864, {37, 152}}, - {4928, {39, 462}}, {4992, {127, 234}}, {5056, {39, 158}}, - {5120, {39, 80}}, {5184, {31, 96}}, {5248, {113, 902}}, - {5312, {41, 166}}, {5376, {251, 336}}, {5440, {43, 170}}, - {5504, {21, 86}}, {5568, {43, 174}}, {5632, {45, 176}}, - {5696, {45, 178}}, {5760, {161, 120}}, {5824, {89, 182}}, - {5888, {323, 184}}, {5952, {47, 186}}, {6016, {23, 94}}, - {6080, {47, 190}}, {6144, {263, 480}}}; +// 0 1 2 3 4 5 6 7 trellis_state // implements the recursive systematic convolutional (RSC) encoder // used in LTE turbo encoding to generate the parity bits diff --git a/test/UpperPHY/Turbo/PermIndices/main.cpp b/test/UpperPHY/Turbo/PermIndices/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..216be7fd4104d6ba17192a442f875c0823f4ef59 --- /dev/null +++ 
b/test/UpperPHY/Turbo/PermIndices/main.cpp @@ -0,0 +1,53 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +*/ +#include "armral.h" + +#include "../turbo_test_data.hpp" + +#include + +static bool run_perm_idxs_test(uint16_t *perm_idxs_buff) { + bool ret = true; + + uint16_t *perm_idx = (uint16_t *)perm_idxs_buff; + for (auto k : valid_ks) { + bool passed = true; + + for (uint16_t i = 0; i < k; ++i, perm_idx += 3) { + uint16_t f1 = perm_params.at(k).first; + uint16_t f2 = perm_params.at(k).second; + uint16_t test_perm_idx = + static_cast((uint64_t(f1) * i + uint64_t(f2) * i * i) % k); + passed &= test_perm_idx == *perm_idx; + passed &= test_perm_idx / 8 == *(perm_idx + 1); + passed &= test_perm_idx % 8 == *(perm_idx + 2); + } + + if (passed) { + printf("[SetupPermIdxs_%u] - check result: OK\n", k); + } else { + // GCOVR_EXCL_START + printf("Error! [SetupPermIdxs_%u]\n", k); + ret = false; + // GCOVR_EXCL_STOP + } + } + + return ret; +} + +int main(int argc, char **argv) { + uint32_t buff_size = 0; + for (auto k : valid_ks) { + buff_size += k; + } + buff_size *= 3; // perm_idx, vec_idx, and vec_lane + std::vector perm_idxs_buff(buff_size); + + armral_turbo_perm_idx_init(perm_idxs_buff.data()); + bool passed = run_perm_idxs_test(perm_idxs_buff.data()); + + exit(passed ? 
EXIT_SUCCESS : EXIT_FAILURE); +} diff --git a/test/UpperPHY/Turbo/turbo_test_data.hpp b/test/UpperPHY/Turbo/turbo_test_data.hpp index 4507b9bb4d56482b1cc1c29b677a3e1a82914371..cd1697df973018f684dd81fff2ffa47b66f2d0d3 100644 --- a/test/UpperPHY/Turbo/turbo_test_data.hpp +++ b/test/UpperPHY/Turbo/turbo_test_data.hpp @@ -6,7 +6,9 @@ #include "rng.hpp" -static void generate_turbo_test_data(uint8_t *src, uint32_t k) { +#include + +static inline void generate_turbo_test_data(uint8_t *src, uint32_t k) { static armral::utils::linear_congruential_generator lcg; auto state = armral::utils::random_state::from_seeds({k}); @@ -46,3 +48,73 @@ static constexpr uint32_t valid_ks[188] = { 4160, 4224, 4288, 4352, 4416, 4480, 4544, 4608, 4672, 4736, 4800, 4864, 4928, 4992, 5056, 5120, 5184, 5248, 5312, 5376, 5440, 5504, 5568, 5632, 5696, 5760, 5824, 5888, 5952, 6016, 6080, 6144}; + +// Internal interleaver parameters f1 and f2 for all valid values of K. +// These are used to compute the interleave polynomial: +// perm_idx = [ (f1*i + f2*i*i) % K for i in range(0, K) ] +// The values of f1 and f2 are taken from the table in TS 36.212 +// Section 5.1.3.2.3. 
+static const std::map> perm_params = { + {40, {3, 10}}, {48, {7, 12}}, {56, {19, 42}}, + {64, {7, 16}}, {72, {7, 18}}, {80, {11, 20}}, + {88, {5, 22}}, {96, {11, 24}}, {104, {7, 26}}, + {112, {41, 84}}, {120, {103, 90}}, {128, {15, 32}}, + {136, {9, 34}}, {144, {17, 108}}, {152, {9, 38}}, + {160, {21, 120}}, {168, {101, 84}}, {176, {21, 44}}, + {184, {57, 46}}, {192, {23, 48}}, {200, {13, 50}}, + {208, {27, 52}}, {216, {11, 36}}, {224, {27, 56}}, + {232, {85, 58}}, {240, {29, 60}}, {248, {33, 62}}, + {256, {15, 32}}, {264, {17, 198}}, {272, {33, 68}}, + {280, {103, 210}}, {288, {19, 36}}, {296, {19, 74}}, + {304, {37, 76}}, {312, {19, 78}}, {320, {21, 120}}, + {328, {21, 82}}, {336, {115, 84}}, {344, {193, 86}}, + {352, {21, 44}}, {360, {133, 90}}, {368, {81, 46}}, + {376, {45, 94}}, {384, {23, 48}}, {392, {243, 98}}, + {400, {151, 40}}, {408, {155, 102}}, {416, {25, 52}}, + {424, {51, 106}}, {432, {47, 72}}, {440, {91, 110}}, + {448, {29, 168}}, {456, {29, 114}}, {464, {247, 58}}, + {472, {29, 118}}, {480, {89, 180}}, {488, {91, 122}}, + {496, {157, 62}}, {504, {55, 84}}, {512, {31, 64}}, + {528, {17, 66}}, {544, {35, 68}}, {560, {227, 420}}, + {576, {65, 96}}, {592, {19, 74}}, {608, {37, 76}}, + {624, {41, 234}}, {640, {39, 80}}, {656, {185, 82}}, + {672, {43, 252}}, {688, {21, 86}}, {704, {155, 44}}, + {720, {79, 120}}, {736, {139, 92}}, {752, {23, 94}}, + {768, {217, 48}}, {784, {25, 98}}, {800, {17, 80}}, + {816, {127, 102}}, {832, {25, 52}}, {848, {239, 106}}, + {864, {17, 48}}, {880, {137, 110}}, {896, {215, 112}}, + {912, {29, 114}}, {928, {15, 58}}, {944, {147, 118}}, + {960, {29, 60}}, {976, {59, 122}}, {992, {65, 124}}, + {1008, {55, 84}}, {1024, {31, 64}}, {1056, {17, 66}}, + {1088, {171, 204}}, {1120, {67, 140}}, {1152, {35, 72}}, + {1184, {19, 74}}, {1216, {39, 76}}, {1248, {19, 78}}, + {1280, {199, 240}}, {1312, {21, 82}}, {1344, {211, 252}}, + {1376, {21, 86}}, {1408, {43, 88}}, {1440, {149, 60}}, + {1472, {45, 92}}, {1504, {49, 846}}, {1536, {71, 
48}}, + {1568, {13, 28}}, {1600, {17, 80}}, {1632, {25, 102}}, + {1664, {183, 104}}, {1696, {55, 954}}, {1728, {127, 96}}, + {1760, {27, 110}}, {1792, {29, 112}}, {1824, {29, 114}}, + {1856, {57, 116}}, {1888, {45, 354}}, {1920, {31, 120}}, + {1952, {59, 610}}, {1984, {185, 124}}, {2016, {113, 420}}, + {2048, {31, 64}}, {2112, {17, 66}}, {2176, {171, 136}}, + {2240, {209, 420}}, {2304, {253, 216}}, {2368, {367, 444}}, + {2432, {265, 456}}, {2496, {181, 468}}, {2560, {39, 80}}, + {2624, {27, 164}}, {2688, {127, 504}}, {2752, {143, 172}}, + {2816, {43, 88}}, {2880, {29, 300}}, {2944, {45, 92}}, + {3008, {157, 188}}, {3072, {47, 96}}, {3136, {13, 28}}, + {3200, {111, 240}}, {3264, {443, 204}}, {3328, {51, 104}}, + {3392, {51, 212}}, {3456, {451, 192}}, {3520, {257, 220}}, + {3584, {57, 336}}, {3648, {313, 228}}, {3712, {271, 232}}, + {3776, {179, 236}}, {3840, {331, 120}}, {3904, {363, 244}}, + {3968, {375, 248}}, {4032, {127, 168}}, {4096, {31, 64}}, + {4160, {33, 130}}, {4224, {43, 264}}, {4288, {33, 134}}, + {4352, {477, 408}}, {4416, {35, 138}}, {4480, {233, 280}}, + {4544, {357, 142}}, {4608, {337, 480}}, {4672, {37, 146}}, + {4736, {71, 444}}, {4800, {71, 120}}, {4864, {37, 152}}, + {4928, {39, 462}}, {4992, {127, 234}}, {5056, {39, 158}}, + {5120, {39, 80}}, {5184, {31, 96}}, {5248, {113, 902}}, + {5312, {41, 166}}, {5376, {251, 336}}, {5440, {43, 170}}, + {5504, {21, 86}}, {5568, {43, 174}}, {5632, {45, 176}}, + {5696, {45, 178}}, {5760, {161, 120}}, {5824, {89, 182}}, + {5888, {323, 184}}, {5952, {47, 186}}, {6016, {23, 94}}, + {6080, {47, 190}}, {6144, {263, 480}}}; \ No newline at end of file diff --git a/utils/matrix_utils.hpp b/utils/matrix_utils.hpp index b12f622a790287362ec841a3b1f82238d737b81b..8a27b84c06b8c749700b53e0e588a2dbb50fd399 100644 --- a/utils/matrix_utils.hpp +++ b/utils/matrix_utils.hpp @@ -173,7 +173,6 @@ static bool check_results_mat_inv( const float32_t rel_tol_mult = 1.0F, const float32_t abs_tol_mult = 1.0F, int verbose = 0) { bool 
passed = true; - float32_t error = 0; float32_t max_error = 0; // TODO: arbitrarily chosen constant. we should probably do better than this, // but until we actually talk to people and get an idea of acceptable @@ -185,7 +184,8 @@ static bool check_results_mat_inv( for (uint32_t i = 0; i < n_values; ++i) { float32_t diff_abs = fabs(result[i] - expected[i]); - error = (expected[i] != 0) ? fabs(diff_abs / expected[i]) : fabs(result[i]); + float32_t error = + (expected[i] != 0) ? fabs(diff_abs / expected[i]) : fabs(result[i]); max_error = std::max(error, max_error); if (!std::isfinite(error) || !std::isfinite(diff_abs)) { @@ -332,19 +332,21 @@ inline void print_cmplx_mat(const std::string &ref, uint32_t m, } /* - * Return the number of floating-point operations required to calculate a length-n - * complex dot product + * Return the number of floating-point operations required to calculate a + * length-n complex dot product */ inline uint32_t cmplx_dot_nflops(uint32_t n) { - // A complex multiplication requires 6 floating-point operations - uint32_t op_mul = 6; - // A complex multiply-accumulate requires 8 floating-point operations - uint32_t op_mla = 8; - uint32_t nflops = 0; if (n > 0) { + // A complex multiplication requires 6 floating-point operations + uint32_t op_mul = 6; + + // A complex multiply-accumulate requires 8 floating-point operations + uint32_t op_mla = 8; + // The cost of multiplying the first two vector entries together nflops += op_mul; + // The cost of multiplying the remaining (n-1) vector entries // and accumulating into the dot product nflops += (n - 1) * op_mla; diff --git a/utils/qint64.hpp b/utils/qint64.hpp index f5f8756f1b6a79bb412ee0e48127ecb229a3e6fd..856899396dfccb445bc39115bbc85494813b9660 100644 --- a/utils/qint64.hpp +++ b/utils/qint64.hpp @@ -18,7 +18,7 @@ public: qint64_t() : val(0) {} /// Conversion from signed integer - qint64_t(int64_t val_in) : val(val_in) {} + explicit qint64_t(int64_t val_in) : val(val_in) {} /// Get the value 
as a 16b saturated signed integer. int16_t get16() const { @@ -36,12 +36,12 @@ public: /// Given a 128-bit integer, returns the saturated 64-bit value. static qint64_t saturate(__int128_t in) { if (in <= LONG_MIN) { - return LONG_MIN; + return qint64_t{LONG_MIN}; } if (in >= LONG_MAX) { - return LONG_MAX; + return qint64_t{LONG_MAX}; } - return (int64_t)in; + return static_cast(in); } }; @@ -146,7 +146,7 @@ void operator*=(qint64_t &l, R r) { */ template qint64_t operator>>(qint64_t l, R r) { - return l.get64() >> r; + return qint64_t{l.get64() >> r}; } /* diff --git a/utils/rng.hpp b/utils/rng.hpp index 358eb47a368b17e2ecf9d6b0353255cb2c215894..0e21bde9a039eb260dfaa387c9ead5f647911f39 100644 --- a/utils/rng.hpp +++ b/utils/rng.hpp @@ -89,7 +89,7 @@ struct random_state { * * @param[in] seed_in The seed to use. */ - random_state(uint64_t seed_in) : seed(seed_in) {} + explicit random_state(uint64_t seed_in) : seed(seed_in) {} /** * Create a `random_state` object from the specified seed values by mixing