diff --git a/CMakeLists.txt b/CMakeLists.txt index 33835c7a0bd8d0360fbd696590718a9ad369a06a..a1136538d119c69ec40fb37c5d697c9b3c210633 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -59,6 +59,19 @@ set(ARMRAL_LIB_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_compression.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_decompression.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/Correlation/arm_correlation.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/DCT/dct_interface.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/DCT/dct_plan.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/DCT/direct_2_point_plan.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/DCT/direct_3_point_plan.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/DCT/direct_4_point_plan.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/DCT/direct_5_point_plan.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/DCT/direct_7_point_plan.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/DCT/direct_8_point_plan.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/DCT/direct_9_point_plan.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/DCT/radix_2_plan.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/DCT/radix_3_plan.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/DCT/radix_5_plan.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/DCT/radix_7_plan.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/bluestein.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gs.c @@ -575,6 +588,7 @@ if(BUILD_TESTING) add_armral_test(turbo_rate_recovery test/UpperPHY/Turbo/Single/RateRecovery/main.cpp) add_armral_test(svd test/MatrixFactorizations/SVD/main.cpp) + add_armral_test(dct test/LowerPHY/DCT/main.cpp) add_armral_bench( matrix_inv_batch_general @@ -782,6 +796,7 @@ if(BUILD_TESTING) add_armral_bench(turbo_rate_recovery bench/UpperPHY/Turbo/Single/RateRecovery/main.cpp) add_armral_bench(svd bench/MatrixFactorizations/SVD/main.cpp) + add_armral_bench(dct_f16 bench/LowerPHY/DCT/F16/main.cpp) endif() if(BUILD_EXAMPLES) @@ -816,6 +831,7 @@ if(BUILD_EXAMPLES) add_armral_example(examples/fft_2d_cf32_example.c 4 5) add_armral_example(examples/modulation_example.c) add_armral_example(examples/polar_example.cpp 128 100 35) + add_armral_example(examples/dct_f16_interleaved_example.c 12 8) endif() if(BUILD_SIMULATION) diff --git a/bench/LowerPHY/DCT/F16/bench.py b/bench/LowerPHY/DCT/F16/bench.py new file mode 100755 index 0000000000000000000000000000000000000000..49f06a2c6e047a1eeb0e4747e65f63d22690779b --- /dev/null +++ b/bench/LowerPHY/DCT/F16/bench.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python3 +# Arm RAN Acceleration Library +# SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause + +import json +from pathlib import Path +import os + + +def get_path(x): + return x if Path(x).is_file() else os.path.join("armral", x) + + +exe_name = get_path("bench_dct_f16") + +reps_base = 10000000 +howmanys = [1, 2, 8] +ns = [2, 3, 4, 5, 7, 9, 12, 20, 36, 72, 100, 4 * 49] + +j = {"exe_name": exe_name, "cases": []} + +for n in ns: + for howmany in howmanys: + if howmany == 1: + name = f"dct_f16_{n}_point_{howmany}" + else: + name = f"dct_f16_{n}_point_{howmany}_interleaved" + case = { + "name": name, + "args": f"{n} {howmany} {howmany} 1 {howmany} 1", + "reps": reps_base // n, + } + + j["cases"].append(case) + +print(json.dumps(j)) diff --git a/bench/LowerPHY/DCT/F16/main.cpp b/bench/LowerPHY/DCT/F16/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..95995019de489cdbde8378f1661259c83a71ad73 --- /dev/null +++ b/bench/LowerPHY/DCT/F16/main.cpp @@ -0,0 +1,65 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause +*/ +#include "armral.h" + +#include +#include +#include +#include + +namespace { + +void run_dct_f16_perf(uint32_t n, uint32_t howmany, uint32_t istride, + uint32_t idist, uint32_t ostride, uint32_t odist, + uint32_t num_reps) { + printf("[DCT f16] - n = %u, howmany = %u, number of iterations = " + "%u\n", + n, howmany, num_reps); + printf(" istride = %u, idist = %u, ostride = %u, odist = %u\n", + istride, idist, ostride, odist); + + const std::vector x(n * howmany); + std::vector y(n * howmany); + + armral_dct_plan_t *p = nullptr; + armral_dct_create_batch_plan_f16(&p, n, howmany, istride, idist, ostride, + odist); + assert(p); + for (uint32_t i = 0; i < num_reps; ++i) { + armral_dct_execute_f16(p, x.data(), y.data()); + } + + armral_dct_destroy_plan_f16(&p); +} + +} // anonymous namespace + +int main(int argc, char **argv) { + if (argc != 8) { + // n + // howmany + // istride + // idist + // ostride + // odist + // nreps - The number of times to repeat the function + fprintf(stderr, "usage: %s n howmany istride idist ostride odist nreps\n", + argv[0]); + exit(EXIT_FAILURE); + } + const auto n = (uint32_t)atoi(argv[1]); + const auto howmany = (uint32_t)(atoi(argv[2])); + const auto istride = (uint32_t)(atoi(argv[3])); + const auto idist = (uint32_t)(atoi(argv[4])); + const auto ostride = (uint32_t)(atoi(argv[5])); + const auto odist = (uint32_t)(atoi(argv[6])); + const auto nreps = (uint32_t)(atoi(argv[7])); + + run_dct_f16_perf(n, howmany, istride, idist, ostride, odist, nreps); + + return EXIT_SUCCESS; +} diff --git a/examples/dct_f16_interleaved_example.c b/examples/dct_f16_interleaved_example.c new file mode 100644 index 0000000000000000000000000000000000000000..d7da93dde248b2badd24489ca058cf59acef8c6e --- /dev/null +++ b/examples/dct_f16_interleaved_example.c @@ -0,0 +1,103 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause +*/ +#include "armral.h" + +#include +#include + +// This function shows how to create a plan and execute a DCT on interleaved +// data using the ArmRAL library +static void example_dct_plan_and_execute(int n, int howmany) { + armral_dct_plan_t *p; + printf("Planning DCT of length = %d, howmany = %d\n", n, howmany); + + // Layout is specifified by stride and dist parameters. Interleaved layout + // means: + // istride == ostride == howmany + // idist == odist == 1 + armral_dct_create_batch_plan_f16(&p, n, howmany, howmany, 1, howmany, 1); + + // Initialise input array, and allocate output array (transforms may be + // in-place, but this example does an out-of-place transform). + armral_half *x = (armral_half *)malloc(n * howmany * sizeof(armral_half)); + armral_half *y = (armral_half *)malloc(n * howmany * sizeof(armral_half)); + for (int i = 0; i < n; i++) { + for (int k = 0; k < howmany; k++) { + x[i * howmany + k] = (armral_half)((float)rand() / RAND_MAX); + } + } + + puts("Input data (interleaved transform operates on columns in the below " + "table):"); + + printf(" "); + for (int k = 0; k < howmany; k++) { + printf(" k=%-2d", k); + } + puts(""); + + for (int i = 0; i < n; i++) { + printf("i=%-2d ", i); + for (int k = 0; k < howmany; k++) { + printf("%.2f ", (float)x[i * howmany + k]); + } + puts(""); + } + + // Execute the batch of transforms. Input and output arrays need to have + // enough space allocated to support the specified layout. This is not checked + // by the library + printf("Performing batch of %d DCTs of length %d\n", howmany, n); + armral_dct_execute_f16(p, x, y); + + // A plan can be re-used to solve other DCTs, but once it is no longer needed + // it should to be destroyed to avoid leaking memory. + puts("Destroying plan"); + armral_dct_destroy_plan_f16(&p); + + puts("Output data:"); + + printf(" "); + for (int k = 0; k < howmany; k++) { + printf(" k=%-3d", k); + } + puts(""); + + for (int i = 0; i < n; i++) { + printf("i=%-2d ", i); + for (int k = 0; k < howmany; k++) { + printf("%5.2f ", (float)y[i * howmany + k]); + } + puts(""); + } + + // Free memory allocated for input and output. This is the responsibility of + // the user, not the plan. + free(x); + free(y); +} + +int main(int argc, char **argv) { + if (argc < 2) { + printf("Usage: %s n howmany\n", argv[0]); + exit(EXIT_FAILURE); + } + + int n = atoi(argv[1]); + if (n < 1) { + printf("Length parameter must be positive and non-zero\n"); + exit(EXIT_FAILURE); + } + + int howmany = atoi(argv[2]); + if (howmany < 1) { + printf("Batch size parameter must be positive and non-zero\n"); + exit(EXIT_FAILURE); + } + + example_dct_plan_and_execute(n, howmany); +} diff --git a/include/armral.h b/include/armral.h index 70d795fe65996f2e435f9fefb699d486ea0ede7e..f05fea3f7599932e6483e9709ec008b98ec78a45 100644 --- a/include/armral.h +++ b/include/armral.h @@ -108,6 +108,7 @@ typedef enum { ARMRAL_SUCCESS = 0, ///< No error. ARMRAL_ARGUMENT_ERROR = -1, ///< One or more arguments are incorrect. ARMRAL_FAIL = -2, ///< A failure has been detected. + ARMRAL_UNSUPPORTED = -3 ///< The requested operation is not supported. } armral_status; /** @@ -200,6 +201,12 @@ typedef struct { float32_t im; ///< 32-bit imaginary component. } armral_cmplx_f32_t; +#ifndef __clang__ +typedef float armral_half __attribute__((__mode__(HF))); +#else +typedef _Float16 armral_half; +#endif + /** * The number of complex samples in each compressed block. */ @@ -3381,6 +3388,82 @@ armral_status armral_fft_execute_cs16(const armral_fft_plan_t *p, */ armral_status armral_fft_destroy_plan_cs16(armral_fft_plan_t **p); +/** + * The opaque structure representing a DCT plan. DCT plan must be initialised + * with \ref armral_dct_create_batch_plan_f16. + */ +typedef struct armral_dct_plan_t armral_dct_plan_t; + +/** + * @brief Creates a DCT plan. + * + * Initialises a DCT plan and sets the out-parameter `p` to point to the created + * plan. It is efficient to create plans once and reuse them, rather than + * creating a plan for every execute call. + * + * Memory layout of the input and output arrays is specified by the stride and + * dist parameters. + * + * To avoid memory leaks, call \ref armral_dct_destroy_plan_f16 when you no + * longer need this plan. + * + * @param[in,out] p A pointer to the DCT plan pointer. + * @param[in] n The size of DCT to be computed by this plan. + * @param[in] howmany The number of transforms to compute. + * @param[in] istride The distance between adjacent elements of the input + * sequence to the transform. For example istride of 1 means + * that the transform is done on a contiguous block of + * memory. istride of 2 means that the input sequence is + * every other element of `in`. + * @param[in] idist The distance between the i-th element of each input + * sequence. For instance if `istride=1` and `idist=n`, then + * each input sequence is stored contiguously in memory with + * no gaps between. If `istride=howmany` and `idist=1` then + * the input sequences are stored with perfect interleaving. + * @param[in] ostride The distance in memory between adjacent elements of the + * output sequence. + * @param[in] odist The distance in memory between the i-th element of each + * output sequence. + * @return An `armral_status` value that indicates success or failure. + */ +armral_status armral_dct_create_batch_plan_f16(armral_dct_plan_t **p, int n, + int howmany, int istride, + int idist, int ostride, + int odist); + +/** + * @brief Executes a DCT plan. + * + * Uses the plan plan created by \ref armral_dct_create_batch_plan_f16 to + * perform the configured DCT with the specified arrays. + * + * @param[in] p The plan to execute. + * @param[in] in The input array. + * @param[out] out The output array. + * @return An `armral_status` value that indicates success or failure. + */ +armral_status armral_dct_execute_f16(const armral_dct_plan_t *p, + const armral_half *in, armral_half *out); + +/** + * @brief Destroys a DCT plan. + * + * Deallocates the plan and all resources used by it. The plan can no longer be + * used after destruction. On exit, `*p` is set to NULL. + * + * @param[in, out] p The plan to destroy. + * @return An `armral_status` value that indicates success or failure. + */ +armral_status armral_dct_destroy_plan_f16(armral_dct_plan_t **p); + +/** + * @brief Prints some debug information about the structure of a DCT plan. + * + * @param[in] p The plan to inspect. + * @return An `armral_status` value that indicates success or failure. + */ +armral_status armral_dct_print_plan_f16(armral_dct_plan_t *p); + /** @} end fft */ /** diff --git a/src/LowerPHY/DCT/dct_interface.cpp b/src/LowerPHY/DCT/dct_interface.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e6873e64a5ba8dca864a61612dfecf37d0811643 --- /dev/null +++ b/src/LowerPHY/DCT/dct_interface.cpp @@ -0,0 +1,47 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause +*/ +#include "armral.h" +#include "dct_plan.hpp" +#include +extern "C" { + +armral_status armral_dct_create_batch_plan_f16(armral_dct_plan_t **p, int n, + int howmany, int istride, + int idist, int ostride, + int odist) { + auto plan = + armral::dct::make_dct_plan(n, howmany, istride, idist, ostride, odist); + /* make_dct_plan returns unique_ptr. Null ptr means the parameter set is not + supported. User's dct plan handle has a raw pointer in - use release to + stop default destruction on return. */ + if (plan) { + *p = plan.release(); + return ARMRAL_SUCCESS; + } + return ARMRAL_UNSUPPORTED; +} + +armral_status armral_dct_execute_f16(const armral_dct_plan_t *p, + const armral_half *in, armral_half *out) { + return p->execute(in, out); +} + +armral_status armral_dct_destroy_plan_f16(armral_dct_plan_t **p) { + /* Plan was allocated using make_unique, but the user has a raw pointer not a + unique_ptr - have to call destructor followed by free manually in order to + deallocate. */ + (*p)->~armral_dct_plan_t(); + free(*p); + *p = NULL; + return ARMRAL_SUCCESS; +} + +armral_status armral_dct_print_plan_f16(armral_dct_plan_t *p) { + p->print_plan(); + return ARMRAL_SUCCESS; +} +} diff --git a/src/LowerPHY/DCT/dct_plan.cpp b/src/LowerPHY/DCT/dct_plan.cpp new file mode 100644 index 0000000000000000000000000000000000000000..123afeea6156ff2e8cd0443af9ca227f1716c047 --- /dev/null +++ b/src/LowerPHY/DCT/dct_plan.cpp @@ -0,0 +1,89 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause +*/ +#include "dct_plan.hpp" +#include "direct_plan.hpp" +#include "odd_radix_plan.hpp" +#include "radix_2_plan.hpp" + +namespace { +bool is_supported_by_mixed_radix_plan(int n) { + switch (n) { + case 1: + return false; + case 2: + case 3: + case 5: + case 7: + return true; + default: + if (n % 2 == 0) { + return is_supported_by_mixed_radix_plan(n / 2); + } + if (n % 3 == 0) { + return is_supported_by_mixed_radix_plan(n / 3); + } + if (n % 5 == 0) { + return is_supported_by_mixed_radix_plan(n / 5); + } + if (n % 7 == 0) { + return is_supported_by_mixed_radix_plan(n / 7); + } + } + return false; +} +} // namespace + +namespace armral::dct { + +utils::unique_ptr make_mixed_radix_plan(int n, int howmany) { + switch (n) { + case 2: + return create_direct_plan<2>(howmany); + case 3: + return create_direct_plan<3>(howmany); + case 4: + return create_direct_plan<4>(howmany); + case 5: + return create_direct_plan<5>(howmany); + case 7: + return create_direct_plan<7>(howmany); + case 8: + return create_direct_plan<8>(howmany); + case 9: + return create_direct_plan<9>(howmany); + default: + if (n % 2 == 0) { + return create_radix_2_plan(n, howmany); + } + if (n % 3 == 0) { + return create_odd_radix_plan<3>(n, howmany); + } + if (n % 5 == 0) { + return create_odd_radix_plan<5>(n, howmany); + } + if (n % 7 == 0) { + return create_odd_radix_plan<7>(n, howmany); + } + // Unsupported configuration + __builtin_unreachable(); + } +} + +utils::unique_ptr make_dct_plan(int n, int howmany, + int istride, int idist, + int ostride, int odist) { + /* Optimised path only supports interleaved format. */ + if (istride == howmany && ostride == howmany && odist == 1 && idist == 1 && + is_supported_by_mixed_radix_plan(n)) { + return make_mixed_radix_plan(n, howmany); + } + + /* No other configuration is currently supported. */ + return utils::unique_ptr(nullptr); +} + +} // namespace armral::dct diff --git a/src/LowerPHY/DCT/dct_plan.hpp b/src/LowerPHY/DCT/dct_plan.hpp new file mode 100644 index 0000000000000000000000000000000000000000..e5adc6acc6987abadf6e307e30d68cd063fbee26 --- /dev/null +++ b/src/LowerPHY/DCT/dct_plan.hpp @@ -0,0 +1,50 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause +*/ +#pragma once + +#include "armral.h" +#include "unique_ptr.hpp" + +#include + +struct armral_dct_plan_t { +public: + virtual ~armral_dct_plan_t() = default; + armral_dct_plan_t() = default; + armral_dct_plan_t(const armral_dct_plan_t &) = delete; + armral_dct_plan_t &operator=(const armral_dct_plan_t &) = delete; + + /* Cannot declare these pure virtual as this introduces dependency on + libstdc++ - instead define them as stubs. */ + virtual armral_status execute(const armral_half * /*unused*/, + armral_half * /*unused*/) const { + assert(false); + __builtin_unreachable(); + }; + + virtual void print_plan() const { + assert(false); + }; + + /* Virtual destructor may result in calls to operator delete (note the calls + are not in the destructor, but emitted in case delete is ever called on + armral_dct_plan_t. Define operator delete here to avoid dependency on + libstdc++, but don't even think about using it! Use utils::make_unique for + dynamically allocated DCT plan. */ + void operator delete(void * /*unused*/) { + assert(false); + }; +}; + +namespace armral::dct { + +utils::unique_ptr make_mixed_radix_plan(int n, int howmany); + +utils::unique_ptr make_dct_plan(int n, int howmany, + int istride, int idist, + int ostride, int odist); +} // namespace armral::dct diff --git a/src/LowerPHY/DCT/direct_2_point_plan.cpp b/src/LowerPHY/DCT/direct_2_point_plan.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ee0c95e90b147f95add8b26c2a8fa241a44b0296 --- /dev/null +++ b/src/LowerPHY/DCT/direct_2_point_plan.cpp @@ -0,0 +1,33 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause +*/ +#include "direct_plan.hpp" +#include "dynamic_value.hpp" +#include "pragmas.hpp" + +namespace armral::dct { + +template +struct direct_plan_worker<2, Howmany> { + static armral_status execute(const armral_half *in, armral_half *out, + Howmany howmany) { + constexpr auto rt2 = static_cast(0x1.6a09e667f3bcdp0); + ARMRAL_PRAGMA_AUTOVEC + for (int k = 0; k < howmany.v; k++) { + const auto x0 = in[k]; + const auto x1 = in[howmany.v + k]; + out[k] = 2 * (x0 + x1); + out[howmany.v + k] = rt2 * (x0 - x1); + } + return ARMRAL_SUCCESS; + }; +}; + +template<> +utils::unique_ptr create_direct_plan<2>(int howmany) { + return utils::make_unique>(howmany); +} +} // namespace armral::dct diff --git a/src/LowerPHY/DCT/direct_3_point_plan.cpp b/src/LowerPHY/DCT/direct_3_point_plan.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1fa32aa010ac07ecd06c0eec8bba087bff229f23 --- /dev/null +++ b/src/LowerPHY/DCT/direct_3_point_plan.cpp @@ -0,0 +1,41 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause +*/ +#include "direct_plan.hpp" +#include "dynamic_value.hpp" +#include "pragmas.hpp" + +namespace armral::dct { + +template +struct direct_plan_worker<3, Howmany> { + static armral_status execute(const armral_half *in, + armral_half *out, // NOLINT (LLVM bug 41393) + Howmany howmany) { + // 2 * cos(pi / 6) + constexpr auto a = static_cast(0x1.bb67ae8584caap0); + + ARMRAL_PRAGMA_AUTOVEC + for (int k = 0; k < howmany.v; k++) { + const auto x0 = in[howmany.v * 0 + k]; + const auto x1 = in[howmany.v * 1 + k] * 2; + const auto x2 = in[howmany.v * 2 + k]; + const auto p02 = x0 + x2; + out[howmany.v * 0 + k] = p02 * 2 + x1; + out[howmany.v * 1 + k] = (x0 - x2) * a; + out[howmany.v * 2 + k] = p02 - x1; + } + + return ARMRAL_SUCCESS; + }; +}; + +template<> +utils::unique_ptr create_direct_plan<3>(int howmany) { + return utils::make_unique>(howmany); +} + +} // namespace armral::dct diff --git a/src/LowerPHY/DCT/direct_4_point_plan.cpp b/src/LowerPHY/DCT/direct_4_point_plan.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2c18d38d745876ecc2f76185671d3b59756eaaaf --- /dev/null +++ b/src/LowerPHY/DCT/direct_4_point_plan.cpp @@ -0,0 +1,57 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause +*/ +#include "direct_plan.hpp" +#include "dynamic_value.hpp" +#include "pragmas.hpp" + +namespace armral::dct { + +template +struct direct_plan_worker<4, Howmany> { + static armral_status execute(const armral_half *in, + armral_half *out, // NOLINT (LLVM bug 41393) + Howmany howmany) { + /* 4-point DCT using Loeffler's algorithm: + C. Loeffler, A. Ligtenberg and G. S. Moschytz, "Practical fast 1-D DCT + algorithms with 11 multiplications," International Conference on + Acoustics, Speech, and Signal Processing,, Glasgow, UK, 1989, pp. 988-991 + vol.2. */ + + constexpr auto rt2 = static_cast(0x1.6a09e667f3bcdp0); + // 2 * cos(1 * pi / 8) + constexpr auto a = static_cast(0x1.d906bcf328d46p0); + // 2 * cos(3 * pi / 8) + constexpr auto b = static_cast(0x1.87de2a6aea963p-1); + // 2 * cos(9 * pi / 8) + constexpr auto c = static_cast(-0x1.d906bcf328d46p0); + + ARMRAL_PRAGMA_AUTOVEC + for (int k = 0; k < howmany.v; k++) { + const auto x0 = in[howmany.v * 0 + k]; + const auto x1 = in[howmany.v * 1 + k]; + const auto x2 = in[howmany.v * 2 + k]; + const auto x3 = in[howmany.v * 3 + k]; + const auto y0 = x0 + x3; + const auto y1 = x1 + x2; + const auto y2 = x0 - x3; + const auto y3 = x1 - x2; + out[howmany.v * 0 + k] = 2 * (y0 + y1); + out[howmany.v * 1 + k] = a * y2 + b * y3; + out[howmany.v * 2 + k] = rt2 * (y0 - y1); + out[howmany.v * 3 + k] = b * y2 + c * y3; + } + + return ARMRAL_SUCCESS; + }; +}; + +template<> +utils::unique_ptr create_direct_plan<4>(int howmany) { + return utils::make_unique>(howmany); +} + +} // namespace armral::dct diff --git a/src/LowerPHY/DCT/direct_5_point_plan.cpp b/src/LowerPHY/DCT/direct_5_point_plan.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f2e45a1990674b369bc063f386d8d38236550c24 --- /dev/null +++ b/src/LowerPHY/DCT/direct_5_point_plan.cpp @@ -0,0 +1,70 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause +*/ +#include "direct_plan.hpp" +#include "dynamic_value.hpp" +#include "pragmas.hpp" + +namespace armral::dct { + +template +struct direct_plan_worker<5, Howmany> { + static armral_status execute(const armral_half *in, + armral_half *out, // NOLINT (LLVM bug 41393) + Howmany howmany) { + /* 5-point algorithm from appendix B of: + G. Bi and L. W. Yu, "DCT algorithms for composite sequence lengths," in + IEEE Transactions on Signal Processing, vol. 46, no. 3, pp. 554-562, + March 1998. */ + + // (cos(pi / 5) - cos(3 * pi / 5)) / 2 + constexpr auto a = static_cast(0x1.1e3779b97f4a8p-1); + // cos(pi / 10) + constexpr auto b = static_cast(0x1.e6f0e134454ffp-1); + // cos(pi / 10) - cos(3 * pi / 10) + constexpr auto c = static_cast(0x1.73fd61d9df543p-2); + // cos(pi / 10) + cos(3 * pi / 10) + constexpr auto d = static_cast(0x1.89f188bdcd7afp0); + + ARMRAL_PRAGMA_AUTOVEC + for (int k = 0; k < howmany.v; k++) { + const auto x0 = in[howmany.v * 0 + k]; + const auto x1 = in[howmany.v * 1 + k]; + const auto x2 = in[howmany.v * 2 + k]; + const auto x3 = in[howmany.v * 3 + k]; + const auto x4 = in[howmany.v * 4 + k]; + + const auto u0 = x0 + x4; + const auto u1 = x1 + x3; + const auto v0 = x0 - x4; + const auto v1 = x1 - x3; + auto t1 = u0 + u1; + auto t2 = u0 - u1; + auto t3 = (t1 * static_cast(0.25)) - x2; + const auto t4 = t2 * a; + + out[howmany.v * 0 + k] = (t1 + x2) * 2; + out[howmany.v * 2 + k] = (t3 + t4) * 2; + out[howmany.v * 4 + k] = (t4 - t3) * 2; + + t1 = (v0 + v1) * b; + t2 = v1 * c; + t3 = v0 * d; + + out[howmany.v * 1 + k] = (t1 - t2) * 2; + out[howmany.v * 3 + k] = (t3 - t1) * 2; + } + + return ARMRAL_SUCCESS; + }; +}; + +template<> +utils::unique_ptr create_direct_plan<5>(int howmany) { + return utils::make_unique>(howmany); +} + +} // namespace armral::dct diff --git a/src/LowerPHY/DCT/direct_7_point_plan.cpp b/src/LowerPHY/DCT/direct_7_point_plan.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d04661ed0eea3e9363a76a14d498488bd3206c16 --- /dev/null +++ b/src/LowerPHY/DCT/direct_7_point_plan.cpp @@ -0,0 +1,61 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause +*/ +#include "direct_plan.hpp" +#include "dynamic_value.hpp" +#include "pragmas.hpp" + +namespace armral::dct { + +template +struct direct_plan_worker<7, Howmany> { + static armral_status execute(const armral_half *in, + armral_half *out, // NOLINT (LLVM bug 41393) + Howmany howmany) { + // ci = 2 * cos(i * pi / 14) + constexpr auto c1 = static_cast(0x1.f329c0558e969p0); + constexpr auto c2 = static_cast(0x1.cd4bca9cb5c71p0); + constexpr auto c3 = static_cast(0x1.904c37505de4bp0); + constexpr auto c4 = static_cast(0x1.3f3a0e28bedd1p0); + constexpr auto c5 = static_cast(0x1.bc4c04d71abc1p-1); + constexpr auto c6 = static_cast(0x1.c7b90e3024582p-2); + + ARMRAL_PRAGMA_AUTOVEC + for (int k = 0; k < howmany.v; k++) { + const auto x0 = in[0 * howmany.v + k]; + const auto x1 = in[1 * howmany.v + k]; + const auto x2 = in[2 * howmany.v + k]; + const auto x3 = in[3 * howmany.v + k] * 2; + const auto x4 = in[4 * howmany.v + k]; + const auto x5 = in[5 * howmany.v + k]; + const auto x6 = in[6 * howmany.v + k]; + + const auto p06 = x0 + x6; + const auto p15 = x1 + x5; + const auto p24 = x2 + x4; + const auto s06 = x0 - x6; + const auto s15 = x1 - x5; + const auto s24 = x2 - x4; + + out[0 * howmany.v + k] = 2 * (p06 + p15 + p24) + x3; + out[1 * howmany.v + k] = c1 * s06 + c3 * s15 + c5 * s24; + out[2 * howmany.v + k] = c2 * p06 - c4 * p24 + c6 * p15 - x3; + out[3 * howmany.v + k] = c3 * s06 - c1 * s24 - c5 * s15; + out[4 * howmany.v + k] = c4 * p06 - c2 * p15 - c6 * p24 + x3; + out[5 * howmany.v + k] = c3 * s24 + c5 * s06 - c1 * s15; + out[6 * howmany.v + k] = c2 * p24 - c4 * p15 + c6 * p06 - x3; + } + + return ARMRAL_SUCCESS; + }; +}; + +template<> +utils::unique_ptr create_direct_plan<7>(int howmany) { + return utils::make_unique>(howmany); +} + +} // namespace armral::dct diff --git a/src/LowerPHY/DCT/direct_8_point_plan.cpp b/src/LowerPHY/DCT/direct_8_point_plan.cpp new file mode 100644 index 0000000000000000000000000000000000000000..964f3ecc9dfae0b37fdb2fb69f7bb2c3f18d2a1b --- /dev/null +++ b/src/LowerPHY/DCT/direct_8_point_plan.cpp @@ -0,0 +1,109 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause +*/ +#include "direct_plan.hpp" +#include "dynamic_value.hpp" +#include "pragmas.hpp" +#include "static_value.hpp" + +namespace armral::dct { + +template +struct direct_plan_worker<8, Howmany> { + static armral_status execute(const armral_half *in, + armral_half *out, // NOLINT (LLVM bug 41393) + Howmany howmany) { + /* 8-point DCT using Loeffler's algorithm: + C. Loeffler, A. Ligtenberg and G. S. Moschytz, "Practical fast 1-D DCT + algorithms with 11 multiplications," International Conference on + Acoustics, Speech, and Signal Processing,, Glasgow, UK, 1989, pp. 988-991 + vol.2. */ + + // 2 * sin(1 * pi / 16) + constexpr auto s1 = static_cast(0x1.8f8b83c69a60bp-2); + // 2 * cos(1 * pi / 16) + constexpr auto c1 = static_cast(0x1.f6297cff75cbp0); + // 2 * sin(3 * pi / 16) + constexpr auto s3 = static_cast(0x1.1c73b39ae68c8p0); + // 2 * cos(3 * pi / 16) + constexpr auto c3 = static_cast(0x1.a9b66290ea1a3p0); + // 2 * sin(6 * pi / 16) + constexpr auto s6 = static_cast(0x1.d906bcf328d46p0); + // 2 * cos(6 * pi / 16) + constexpr auto c6 = static_cast(0x1.87de2a6aea963p-1); + // sqrt(2) + constexpr auto rt2 = static_cast(0x1.6a09e667f3bcdp0); + // 1 / sqrt(2) + constexpr auto irt2 = static_cast(0x1.6a09e667f3bcdp-1); + + ARMRAL_PRAGMA_AUTOVEC + for (int k = 0; k < howmany.v; k++) { + const auto x_0 = in[howmany.v * 0 + k]; + const auto x_1 = in[howmany.v * 1 + k]; + const auto x_2 = in[howmany.v * 2 + k]; + const auto x_3 = in[howmany.v * 3 + k]; + const auto x_4 = in[howmany.v * 4 + k]; + const auto x_5 = in[howmany.v * 5 + k]; + const auto x_6 = in[howmany.v * 6 + k]; + const auto x_7 = in[howmany.v * 7 + k]; + + // Stage 1 + const auto x1_0 = x_0 + x_7; + const auto x1_1 = x_1 + x_6; + const auto x1_2 = x_2 + x_5; + const auto x1_3 = x_3 + x_4; + const auto x1_4 = x_3 - x_4; + const auto x1_5 = x_2 - x_5; + const auto x1_6 = x_1 - x_6; + const auto x1_7 = x_0 - x_7; + + // Stage 2 + const auto x2_0 = x1_0 + x1_3; + const auto x2_1 = x1_1 + x1_2; + const auto x2_2 = x1_1 - x1_2; + const auto x2_3 = x1_0 - x1_3; + const auto x2_4 = c3 * x1_4 + s3 * x1_7; + const auto x2_5 = c1 * x1_5 + s1 * x1_6; + const auto x2_6 = c1 * x1_6 - s1 * x1_5; + const auto x2_7 = c3 * x1_7 - s3 * x1_4; + + // Stage 3 + const auto x3_0 = x2_0 + x2_1; + const auto x3_1 = x2_0 - x2_1; + const auto x3_2 = c6 * x2_2 + s6 * x2_3; + const auto x3_3 = c6 * x2_3 - s6 * x2_2; + const auto x3_4 = (x2_4 + x2_6) * irt2; + const auto x3_5 = x2_7 - x2_5; + const auto x3_6 = x2_4 - x2_6; + const auto x3_7 = (x2_7 + x2_5) * irt2; + + out[k + howmany.v * 0] = x3_0 * 2; + out[k + howmany.v * 2] = x3_2; + out[k + howmany.v * 4] = x3_1 * rt2; + out[k + howmany.v * 6] = x3_3; + + // Stage 4 + out[k + howmany.v * 1] = x3_7 + x3_4; + out[k + howmany.v * 3] = x3_5; + out[k + howmany.v * 5] = x3_6; + out[k + howmany.v * 7] = x3_7 - x3_4; + } + + return ARMRAL_SUCCESS; + }; +}; + +template<> +utils::unique_ptr create_direct_plan<8>(int howmany) { + switch (howmany) { + case 1: + return utils::make_unique>>(howmany); + default: + return utils::make_unique>(howmany); + } +} + +} // namespace armral::dct diff --git a/src/LowerPHY/DCT/direct_9_point_plan.cpp b/src/LowerPHY/DCT/direct_9_point_plan.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c8d3f2907f70ae9f154601c200e4214f0673589a --- /dev/null +++ b/src/LowerPHY/DCT/direct_9_point_plan.cpp @@ -0,0 +1,98 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause +*/ +#include "direct_plan.hpp" +#include "dynamic_value.hpp" +#include "pragmas.hpp" + +namespace armral::dct { + +template +struct direct_plan_worker<9, Howmany> { + static armral_status execute(const armral_half *in, + armral_half *out, // NOLINT (LLVM bug 41393) + Howmany howmany) { + /* 5-point algorithm from appendix B of: + G. Bi and L. W. Yu, "DCT algorithms for composite sequence lengths," in + IEEE Transactions on Signal Processing, vol. 46, no. 3, pp. 554-562, + March 1998, with some modification for the extra factor of 2 in the + output. */ + + // 2 * sin(1 * pi / 9) + constexpr auto s1 = static_cast(0x1.5e3a8748a0bf5p-1); + // 2 * sin(2 * pi / 9) + constexpr auto s2 = static_cast(0x1.491b7523c161dp0); + // 2 * sin(3 * pi /9) + constexpr auto s3 = static_cast(0x1.bb67ae8584caap0); + // 2 * cos(1 * pi / 9) + constexpr auto c1 = static_cast(0x1.e11f642522d1cp0); + // 2 * cos(2 * pi / 9) + constexpr auto c2 = static_cast(0x1.8836fa2cf5039p0); + // 2 * cos(0.5 * pi / 9) + constexpr auto ch = static_cast(0x1.f838b8c811c17p0); + // 2 * sin(0.5 * pi / 9) + constexpr auto sh = static_cast(0x1.63a1a7e0b738ap-2); + + ARMRAL_PRAGMA_AUTOVEC + for (int k = 0; k < howmany.v; k++) { + const auto x0 = in[howmany.v * 0 + k]; + const auto x1 = in[howmany.v * 1 + k]; + const auto x2 = in[howmany.v * 2 + k]; + const auto x3 = in[howmany.v * 3 + k]; + const auto x4 = in[howmany.v * 4 + k] * 2; + const auto x5 = in[howmany.v * 5 + k]; + const auto x6 = in[howmany.v * 6 + k]; + const auto x7 = in[howmany.v * 7 + k]; + const auto x8 = in[howmany.v * 8 + k]; + + const auto u0 = x0 + x8; + const auto u1 = x1 + x7; + const auto u2 = x2 + x6; + const auto u3 = x3 + x5; + const auto v0 = x0 - x8; + const auto v1 = x1 - x7; + const auto v2 = x2 - x6; + const auto v3 = x3 - x5; + + const auto t0 = u0 + u2 + u3; + const auto t2 = u0 - u2; + const auto t3 = u2 - u3; + const auto t1 = (t2 + t3) * c2; + const auto t4 = u1 - x4; + + const auto t6 = t2 * sh; + const auto t7 = t3 * c1; + const auto t8 = 2 * u1 + x4; + const auto t9 = t1 + t6; + const auto t10 = t1 - t7; + + const auto t11 = v1 * s3; + const auto t12 = (v0 + v3) * ch; + const auto t13 = (v2 - v3) * s2; + const auto t14 = (v0 + v2) * s1; + const auto t15 = t12 + t13; + const auto t16 = t12 - t14; + + out[k + howmany.v * 0] = 2 * t0 + t8; + out[k + howmany.v * 1] = t15 + t11; + out[k + howmany.v * 2] = t9 + t4; + out[k + howmany.v * 3] = (v0 - v2 - v3) * s3; + out[k + howmany.v * 4] = t10 - t4; + out[k + howmany.v * 5] = t16 - t11; + out[k + howmany.v * 6] = t0 - t8; + out[k + howmany.v * 7] = t15 - t16 - t11; + out[k + howmany.v * 8] = t9 - t10 - t4; + } + + return ARMRAL_SUCCESS; + }; +}; + +template<> +utils::unique_ptr create_direct_plan<9>(int howmany) { + return utils::make_unique>(howmany); +} +} // namespace armral::dct diff --git a/src/LowerPHY/DCT/direct_plan.hpp b/src/LowerPHY/DCT/direct_plan.hpp new file mode 100644 index 0000000000000000000000000000000000000000..690c41eee886f023dc356b8e07fc4122ec7bdd74 --- /dev/null +++ b/src/LowerPHY/DCT/direct_plan.hpp @@ -0,0 +1,38 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause +*/ +#pragma once + +#include "dct_plan.hpp" + +#include + +namespace armral::dct { + +template +struct direct_plan_worker; + +template +class direct_plan : public armral_dct_plan_t { +public: + direct_plan(int howmany) : m_howmany(howmany) {} + + armral_status execute(const armral_half *in, armral_half *out) const final { + return direct_plan_worker::execute(in, out, m_howmany); + }; + + void print_plan() const final { + printf("direct_%d_point_plan(howmany=%d)\n", N, m_howmany.v); + } + +private: + const Howmany m_howmany; +}; + +template +utils::unique_ptr create_direct_plan(int howmany); + +} // namespace armral::dct diff --git a/src/LowerPHY/DCT/dynamic_value.hpp b/src/LowerPHY/DCT/dynamic_value.hpp new file mode 100644 index 0000000000000000000000000000000000000000..f7794f97406f9696d0b9b428ad253aa741b834ca --- /dev/null +++ b/src/LowerPHY/DCT/dynamic_value.hpp @@ -0,0 +1,18 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause +*/ +#pragma once + +class dynamic_value { + /* Utility class to allow DCT stages to provide specialisations for certain + values of some parameter (via the partner class static_value) while still + allowing arbitrary values to be chosen at runtime. */ +public: + // NOLINTNEXTLINE(readability-identifier-naming) + dynamic_value(int v_) : v(v_) {} + + const int v; +}; diff --git a/src/LowerPHY/DCT/odd_radix_plan.hpp b/src/LowerPHY/DCT/odd_radix_plan.hpp new file mode 100644 index 0000000000000000000000000000000000000000..406d0b1e597f4d6abfbe7b53f44a5ef0de38cc39 --- /dev/null +++ b/src/LowerPHY/DCT/odd_radix_plan.hpp @@ -0,0 +1,91 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause +*/ +#pragma once + +#include "dct_plan.hpp" +#include "pod_vector.hpp" +#include "unique_ptr.hpp" + +#include +#include + +namespace armral::dct { + +template +struct odd_radix_worker; + +template +class odd_radix_plan : public armral_dct_plan_t { +public: + odd_radix_plan(int n, int howmany) + : m_howmany(howmany), m_n(n), m_d(n / Q), m_twid_t(n_cs * m_d), + m_twid_a(n_cs * n), m_plan(make_mixed_radix_plan(n / Q, Q * howmany)) { + /* Initialise twiddle factors as described in equations 9 and 10: + G. Bi and L. W. Yu, "DCT algorithms for composite sequence lengths," in + IEEE Transactions on Signal Processing, vol. 46, no. 3, pp. 554-562, + March 1998. + + m_twid_t implements the tan-based factor. Even elements of m_twid_a + implement the cos-based factor in eqn. 9, odd elements implement the + sin-based factor in eqn. 10. */ + utils::pod_vector tmp(m_twid_t.size()); + for (int m = 0; m < n_cs; m++) { + for (int k = 0; k < m_d; k++) { + const double twid_idx = M_PI * k * (Q - 1 - 2 * m) / (2 * n); + m_twid_t[m * m_d + k] = static_cast(std::tan(twid_idx)); + tmp[m * m_d + k] = std::cos(twid_idx); + } + } + + for (int j = 0; j < Q; j++) { + for (int m = 0; m < n_cs; m++) { + const double twid_idx = M_PI * (Q - 1 - 2 * m) * j / (2 * Q); + for (int k = 0; k < m_d; k++) { + if (j % 2 == 0) { + m_twid_a[m_d * (j * n_cs + m) + k] = static_cast( + tmp[m * m_d + k] * std::cos(twid_idx) * std::pow(-1.0, j / 2)); + } else { + m_twid_a[m_d * (j * n_cs + m) + k] = + static_cast(tmp[m * m_d + k] * std::sin(twid_idx) * + std::pow(-1.0, (j - 1) / 2)); + } + } + } + } + } + + void print_plan() const final { + printf("radix_%d_plan(n=%d, howmany=%d)\n", Q, m_n, m_howmany.v); + m_plan->print_plan(); + } + + armral_status execute(const armral_half *in, armral_half *out) const final { + utils::pod_vector buf(m_n * m_howmany.v); + odd_radix_worker::prepare_sub_transform(in, buf, m_d, + m_howmany); + m_plan->execute(buf.data(), buf.data()); + odd_radix_worker::extract_sub_transform( + out, buf, m_d, m_howmany, m_twid_t, m_twid_a); + return ARMRAL_SUCCESS; + } + +private: + // Highest index of S_n or C_n sub-transform + static constexpr int n_cs = (Q - 1) / 2; + const Howmany m_howmany; + const int m_n; + // N of the sub-transform + const int m_d; + utils::pod_vector m_twid_t; + utils::pod_vector m_twid_a; + utils::unique_ptr m_plan; +}; + +template +utils::unique_ptr create_odd_radix_plan(int n, int howmany); + +} // namespace armral::dct diff --git a/src/LowerPHY/DCT/radix_2_plan.cpp b/src/LowerPHY/DCT/radix_2_plan.cpp new file mode 100644 index 0000000000000000000000000000000000000000..46a05b1f9df19e1bf0988b559877622f6742153e --- /dev/null +++ b/src/LowerPHY/DCT/radix_2_plan.cpp @@ -0,0 +1,77 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause +*/ +#include "radix_2_plan.hpp" +#include "dct_plan.hpp" +#include "dynamic_value.hpp" +#include "pragmas.hpp" +#include "static_value.hpp" + +#include + +namespace armral::dct { + +template +radix_2_plan::radix_2_plan(int n, int howmany) + : m_n(n), m_howmany(howmany), m_twid(n / 2), m_buf(n * howmany), + // Use Tw as storage and working precision for the sub-plan + m_plan(make_mixed_radix_plan(n / 2, howmany * 2)) { + for (int i = 0; i < n / 2; i++) { + double m = i * 2 + 1; + m_twid[i] = static_cast(0.5 / std::cos(M_PI * m / (2 * n))); + } +} + +template +armral_status radix_2_plan::execute(const armral_half *in, + armral_half *out) const { + /* Radix-2 decomposition as described in: + M. Perera, S., Silverio, D., Ogle, A. (2019). Efficient Split-Radix and + Radix-4 DCT Algorithms and Applications. Analysis of Experimental + Algorithms. SEA 2019. Lecture Notes in Computer Science(), vol 11544. + Springer, Cham. */ + + for (int i = 0; i < m_n / 2; i++) { + ARMRAL_PRAGMA_AUTOVEC + for (int k = 0; k < m_howmany.v; k++) { + const auto a = in[k + m_howmany.v * i]; + const auto b = in[k + m_howmany.v * (m_n - i - 1)]; + m_buf[k + m_howmany.v * (2 * i)] = a + b; + m_buf[k + m_howmany.v * (2 * i + 1)] = (a - b) * m_twid[i]; + } + } + + m_plan->execute(m_buf.data(), out); + + for (int i = 1; i < m_n - 1; i += 2) { + ARMRAL_PRAGMA_AUTOVEC + for (int k = 0; k < m_howmany.v; k++) { + out[m_howmany.v * i + k] += out[m_howmany.v * (i + 2) + k]; + } + } + return ARMRAL_SUCCESS; +} + +template +void radix_2_plan::print_plan() const { + printf("radix_2_plan(n=%d, howmany=%d)\n", m_n, m_howmany.v); + m_plan->print_plan(); +} + +utils::unique_ptr create_radix_2_plan(int n, int howmany) { + switch (howmany) { + case 1: + return utils::make_unique>>(n, howmany); + case 2: + return utils::make_unique>>(n, howmany); + case 3: + return utils::make_unique>>(n, howmany); + default: + return utils::make_unique>(n, howmany); + } +} + +} // namespace armral::dct diff --git a/src/LowerPHY/DCT/radix_2_plan.hpp b/src/LowerPHY/DCT/radix_2_plan.hpp new file mode 100644 index 0000000000000000000000000000000000000000..3b858050da674b42b8bd8a73858fb79251709a24 --- /dev/null +++ b/src/LowerPHY/DCT/radix_2_plan.hpp @@ -0,0 +1,33 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause +*/ +#pragma once + +#include "dct_plan.hpp" +#include "pod_vector.hpp" +#include "unique_ptr.hpp" + +namespace armral::dct { + +template +class radix_2_plan : public armral_dct_plan_t { +public: + radix_2_plan(int n, int howmany); + + armral_status execute(const armral_half *in, armral_half *out) const final; + void print_plan() const override; + +private: + const int m_n; + const Howmany m_howmany; + utils::pod_vector m_twid; + utils::pod_vector m_buf; + utils::unique_ptr m_plan; +}; + +utils::unique_ptr create_radix_2_plan(int n, int howmany); + +} // namespace armral::dct diff --git a/src/LowerPHY/DCT/radix_3_plan.cpp b/src/LowerPHY/DCT/radix_3_plan.cpp new file mode 100644 index 0000000000000000000000000000000000000000..fa14bc4c85a74ab7c9297922e1a405aedb73bb79 --- /dev/null +++ b/src/LowerPHY/DCT/radix_3_plan.cpp @@ -0,0 +1,113 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause +*/ +#include "dynamic_value.hpp" +#include "odd_radix_plan.hpp" +#include "pragmas.hpp" +#include "static_value.hpp" + +namespace armral::dct { + +template +struct odd_radix_worker<3, Howmany> { + static void prepare_sub_transform(const armral_half *in, + const utils::pod_vector &buf, + int d, Howmany howmany) { + // Prepare the inputs for transforms A, C_0 and S_0 (the latter two are just + // referred to as c and s throughout + for (int i = 0; i < d; i++) { + ARMRAL_PRAGMA_AUTOVEC + for (int k = 0; k < howmany.v; k++) { + // x and y are inputs to the butterfly forming input sequences for c and + // s + const auto x = in[(3 * i + 0) * howmany.v + k]; + const auto y = in[(3 * i + 2) * howmany.v + k]; + const auto c_in = x + y; + auto s_in = x - y; + // (-1)^n term of eqn. 6b + if (i % 2 == 1) { + s_in = -s_in; + } + + const auto a_in = in[(3 * i + 1) * howmany.v + k]; + + buf[(3 * i + 0) * howmany.v + k] = a_in; + buf[(3 * i + 1) * howmany.v + k] = c_in; + buf[(3 * i + 2) * howmany.v + k] = s_in; + } + } + }; + + static void + extract_sub_transform(armral_half *out, // NOLINT (LLVM bug 41393) + const utils::pod_vector &buf, int d, + Howmany howmany, + const utils::pod_vector &twid_t, + const utils::pod_vector &twid_a) { + // First element of each subsequence F_n (n = 0 to 2) requires minimal + // post-processing. In addition some twiddle factors can be elided as they + // are 1 or 0. */ + ARMRAL_PRAGMA_AUTOVEC + for (int k = 0; k < howmany.v; k++) { + const auto a = buf[0 * howmany.v + k]; + const auto c = buf[1 * howmany.v + k]; + const auto s = buf[2 * howmany.v + k]; + out[k + 0 * d * howmany.v] = c + a; + out[k + 1 * d * howmany.v] = s * twid_a[d]; + out[k + 2 * d * howmany.v] = c * static_cast(0.5) - a; + } + + // Unrolled version of the loop over n_cs in equations 9 and 10 + for (int i = 1; i < d; i++) { + ARMRAL_PRAGMA_AUTOVEC + for (int k = 0; k < howmany.v; k++) { + // c and s are combined in opposite directions - loop over both + // in both directions to fully unroll the sum + const auto c_fwd = buf[(i * 3 + 1) * howmany.v + k]; + const auto s_fwd = buf[(i * 3 + 2) * howmany.v + k]; + const auto c_bck = buf[((d - i) * 3 + 1) * howmany.v + k]; + const auto s_bck = buf[((d - i) * 3 + 2) * howmany.v + k]; + const auto a = buf[i * howmany.v * 3 + k]; + + out[(0 * d + i) * howmany.v + k] = + (c_fwd + s_bck * twid_t[i]) * twid_a[0 * d + i] + a; + out[(1 * d + i) * howmany.v + k] = + (s_fwd + c_bck * twid_t[i]) * twid_a[1 * d + i]; + out[(2 * d + i) * howmany.v + k] = + (c_fwd + s_bck * twid_t[i]) * twid_a[2 * d + i] - a; + } + } + + for (int j = 1; j < 3; j++) { + // eqn. 11 + for (int i = 1; i < d; i++) { + ARMRAL_PRAGMA_AUTOVEC + for (int k = 0; k < howmany.v; k++) { + out[(d * j + i) * howmany.v + k] = + 2 * out[(d * j + i) * howmany.v + k] - + out[(d * j - i) * howmany.v + k]; + } + } + } + }; +}; + +template<> +utils::unique_ptr create_odd_radix_plan<3>(int n, + int howmany) { + switch (howmany) { + case 1: + return utils::make_unique>>(n, howmany); + case 2: + return utils::make_unique>>(n, howmany); + case 3: + return utils::make_unique>>(n, howmany); + default: + return utils::make_unique>(n, howmany); + } +} + +} // namespace armral::dct diff --git a/src/LowerPHY/DCT/radix_5_plan.cpp b/src/LowerPHY/DCT/radix_5_plan.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e3de86e267a31e8b88afa856ee435ed2a9aee36f --- /dev/null +++ b/src/LowerPHY/DCT/radix_5_plan.cpp @@ -0,0 +1,138 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause +*/ +#include "dynamic_value.hpp" +#include "odd_radix_plan.hpp" +#include "pragmas.hpp" +#include "static_value.hpp" + +namespace armral::dct { + +template +struct odd_radix_worker<5, Howmany> { + static void prepare_sub_transform(const armral_half *in, + const utils::pod_vector &buf, + int d, Howmany howmany) { + // Prepare the inputs for transforms A, C_0, C_1, S_0 and S_1 + for (int i = 0; i < d; i++) { + ARMRAL_PRAGMA_AUTOVEC + for (int k = 0; k < howmany.v; k++) { + const auto a_in = in[(5 * i + 2) * howmany.v + k]; + const auto x0 = in[(5 * i + 0) * howmany.v + k]; + const auto y0 = in[(5 * i + 4) * howmany.v + k]; + const auto x1 = in[(5 * i + 1) * howmany.v + k]; + const auto y1 = in[(5 * i + 3) * howmany.v + k]; + const auto c0_in = x0 + y0; + auto s0_in = x0 - y0; + const auto c1_in = x1 + y1; + auto s1_in = x1 - y1; + + if (i % 2 == 1) { + // (-1)^n term of eqn. 6b + s0_in = -s0_in; + s1_in = -s1_in; + } + + buf[(5 * i + 0) * howmany.v + k] = a_in; + buf[(5 * i + 1) * howmany.v + k] = c0_in; + buf[(5 * i + 2) * howmany.v + k] = c1_in; + buf[(5 * i + 3) * howmany.v + k] = s0_in; + buf[(5 * i + 4) * howmany.v + k] = s1_in; + } + } + }; + + static void + extract_sub_transform(armral_half *out, // NOLINT (LLVM bug 41393) + const utils::pod_vector &buf, int d, + Howmany howmany, + const utils::pod_vector &twid_t, + const utils::pod_vector &twid_a) { + ARMRAL_PRAGMA_AUTOVEC + for (int k = 0; k < howmany.v; k++) { + const auto a = buf[0 * howmany.v + k]; + const auto c0 = buf[1 * howmany.v + k]; + const auto c1 = buf[2 * howmany.v + k]; + const auto s0 = buf[3 * howmany.v + k]; + const auto s1 = buf[4 * howmany.v + k]; + + // Twiddle factors are both 1: + out[0 * d * howmany.v + k] = c0 + c1 + a; + // Because of symmetry in twid_a some twids can be shared in the below + out[1 * d * howmany.v + k] = s0 * twid_a[d * 2] + s1 * twid_a[d * 3]; + out[2 * d * howmany.v + k] = c0 * twid_a[d * 4] + c1 * twid_a[d * 5] - a; + out[3 * d * howmany.v + k] = s0 * twid_a[d * 3] - s1 * twid_a[d * 2]; + out[4 * d * howmany.v + k] = a - c0 * twid_a[d * 5] - c1 * twid_a[d * 4]; + } + + // Unrolled version of the loop over n_cs in equations 9 and 10 + for (int i = 1; i < d; i++) { + ARMRAL_PRAGMA_AUTOVEC + for (int k = 0; k < howmany.v; k++) { + const auto a = buf[(i * 5 + 0) * howmany.v + k]; + + const auto c0_fwd = buf[(i * 5 + 1) * howmany.v + k]; + const auto c1_fwd = buf[(i * 5 + 2) * howmany.v + k]; + const auto s0_fwd = buf[(i * 5 + 3) * howmany.v + k]; + const auto s1_fwd = buf[(i * 5 + 4) * howmany.v + k]; + + const auto c0_bck = buf[((d - i) * 5 + 1) * howmany.v + k]; + const auto c1_bck = buf[((d - i) * 5 + 2) * howmany.v + k]; + const auto s0_bck = buf[((d - i) * 5 + 3) * howmany.v + k]; + const auto s1_bck = buf[((d - i) * 5 + 4) * howmany.v + k]; + + out[(0 * d + i) * howmany.v + k] = + (c0_fwd + s0_bck * twid_t[0 + i]) * twid_a[i] + + (c1_fwd + s1_bck * twid_t[d + i]) * twid_a[d + i] + a; + + out[(1 * d + i) * howmany.v + k] = + (s0_fwd + c0_bck * twid_t[0 + i]) * twid_a[d * 2 + i] + + (s1_fwd + c1_bck * twid_t[d + i]) * twid_a[d * 3 + i]; + + out[(2 * d + i) * howmany.v + k] = + (c0_fwd + s0_bck * twid_t[0 + i]) * twid_a[d * 4 + i] + + (c1_fwd + s1_bck * twid_t[d + i]) * twid_a[d * 5 + i] - a; + + out[(3 * d + i) * howmany.v + k] = + (s0_fwd + c0_bck * twid_t[0 + i]) * twid_a[d * 6 + i] + + (s1_fwd + c1_bck * twid_t[d + i]) * twid_a[d * 7 + i]; + + out[(4 * d + i) * howmany.v + k] = + (c0_fwd + s0_bck * twid_t[0 + i]) * twid_a[d * 8 + i] + + (c1_fwd + s1_bck * twid_t[d + i]) * twid_a[d * 9 + i] + a; + } + } + + for (int j = 1; j < 5; j++) { + // eqn. 11 + for (int i = 1; i < d; i++) { + ARMRAL_PRAGMA_AUTOVEC + for (int k = 0; k < howmany.v; k++) { + out[(d * j + i) * howmany.v + k] = + 2 * out[(d * j + i) * howmany.v + k] - + out[(d * j - i) * howmany.v + k]; + } + } + } + }; +}; + +template<> +utils::unique_ptr create_odd_radix_plan<5>(int n, + int howmany) { + switch (howmany) { + case 1: + return utils::make_unique>>(n, howmany); + case 2: + return utils::make_unique>>(n, howmany); + case 3: + return utils::make_unique>>(n, howmany); + default: + return utils::make_unique>(n, howmany); + } +} + +} // namespace armral::dct diff --git a/src/LowerPHY/DCT/radix_7_plan.cpp b/src/LowerPHY/DCT/radix_7_plan.cpp new file mode 100644 index 0000000000000000000000000000000000000000..bbc21f2ce49b9f015693ebef4e6633a7aeaecbde --- /dev/null +++ b/src/LowerPHY/DCT/radix_7_plan.cpp @@ -0,0 +1,178 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause +*/ +#include "dynamic_value.hpp" +#include "odd_radix_plan.hpp" +#include "pragmas.hpp" +#include "static_value.hpp" + +namespace armral::dct { + +template +struct odd_radix_worker<7, Howmany> { + static void prepare_sub_transform(const armral_half *in, + const utils::pod_vector &buf, + int d, Howmany howmany) { + // Prepare the inputs for transforms A, C_n and S_n (n = 0 to 2) + for (int i = 0; i < d; i++) { + ARMRAL_PRAGMA_AUTOVEC + for (int k = 0; k < howmany.v; k++) { + const auto a_in = in[(7 * i + 3) * howmany.v + k]; + const auto a0 = in[(7 * i + 0) * howmany.v + k]; + const auto a1 = in[(7 * i + 1) * howmany.v + k]; + const auto a2 = in[(7 * i + 2) * howmany.v + k]; + const auto b0 = in[(7 * i + 6) * howmany.v + k]; + const auto b1 = in[(7 * i + 5) * howmany.v + k]; + const auto b2 = in[(7 * i + 4) * howmany.v + k]; + const auto c0_in = a0 + b0; + auto s0_in = a0 - b0; + const auto c1_in = a1 + b1; + auto s1_in = a1 - b1; + const auto c2_in = a2 + b2; + auto s2_in = a2 - b2; + + if (i % 2 == 1) { + // (-1)^n term of eqn. 6b + s0_in = -s0_in; + s1_in = -s1_in; + s2_in = -s2_in; + } + + buf[(7 * i + 0) * howmany.v + k] = a_in; + buf[(7 * i + 1) * howmany.v + k] = c0_in; + buf[(7 * i + 2) * howmany.v + k] = c1_in; + buf[(7 * i + 3) * howmany.v + k] = c2_in; + buf[(7 * i + 4) * howmany.v + k] = s0_in; + buf[(7 * i + 5) * howmany.v + k] = s1_in; + buf[(7 * i + 6) * howmany.v + k] = s2_in; + } + } + }; + + static void + extract_sub_transform(armral_half *out, // NOLINT (LLVM bug 41393) + const utils::pod_vector &buf, int d, + Howmany howmany, + const utils::pod_vector &twid_t, + const utils::pod_vector &twid_a) { + ARMRAL_PRAGMA_AUTOVEC + for (int k = 0; k < howmany.v; k++) { + const auto a = buf[0 * howmany.v + k]; + const auto c0 = buf[1 * howmany.v + k]; + const auto c1 = buf[2 * howmany.v + k]; + const auto c2 = buf[3 * howmany.v + k]; + const auto s0 = buf[4 * howmany.v + k]; + const auto s1 = buf[5 * howmany.v + k]; + const auto s2 = buf[6 * howmany.v + k]; + + // Twiddle factors are both 1: + out[0 * d * howmany.v + k] = c0 + c1 + c2 + a; + // Because of symmetry in twid_a some twids can be shared in the below + out[1 * d * howmany.v + k] = + s0 * twid_a[3 * d] + s1 * twid_a[4 * d] + s2 * twid_a[5 * d]; + out[2 * d * howmany.v + k] = + c0 * twid_a[6 * d] + c1 * twid_a[7 * d] + c2 * twid_a[8 * d] - a; + out[3 * d * howmany.v + k] = + s0 * twid_a[4 * d] - s1 * twid_a[5 * d] - s2 * twid_a[3 * d]; + out[4 * d * howmany.v + k] = + a - c0 * twid_a[8 * d] - c1 * twid_a[6 * d] - c2 * twid_a[7 * d]; + out[5 * d * howmany.v + k] = + s0 * twid_a[5 * d] - s1 * twid_a[3 * d] + s2 * twid_a[4 * d]; + out[6 * d * howmany.v + k] = + c0 * twid_a[7 * d] + c1 * twid_a[8 * d] + c2 * twid_a[6 * d] - a; + } + + // Unrolled version of the loop over n_cs in equations 9 and 10 + for (int i = 1; i < d; i++) { + ARMRAL_PRAGMA_AUTOVEC + for (int k = 0; k < howmany.v; k++) { + const auto a = buf[(i * 7 + 0) * howmany.v + k]; + + const auto c0_fwd = buf[(i * 7 + 1) * howmany.v + k]; + const auto c1_fwd = buf[(i * 7 + 2) * howmany.v + k]; + const auto c2_fwd = buf[(i * 7 + 3) * howmany.v + k]; + const auto s0_fwd = buf[(i * 7 + 4) * howmany.v + k]; + const auto s1_fwd = buf[(i * 7 + 5) * howmany.v + k]; + const auto s2_fwd = buf[(i * 7 + 6) * howmany.v + k]; + + const auto c0_bck = buf[((d - i) * 7 + 1) * howmany.v + k]; + const auto c1_bck = buf[((d - i) * 7 + 2) * howmany.v + k]; + const auto c2_bck = buf[((d - i) * 7 + 3) * howmany.v + k]; + const auto s0_bck = buf[((d - i) * 7 + 4) * howmany.v + k]; + const auto s1_bck = buf[((d - i) * 7 + 5) * howmany.v + k]; + const auto s2_bck = buf[((d - i) * 7 + 6) * howmany.v + k]; + + const auto t0 = twid_t[0 * d + i]; + const auto t1 = twid_t[1 * d + i]; + const auto t2 = twid_t[2 * d + i]; + + out[(0 * d + i) * howmany.v + k] = + (c0_fwd + s0_bck * t0) * twid_a[0 * d + i] + + (c1_fwd + s1_bck * t1) * twid_a[1 * d + i] + + (c2_fwd + s2_bck * t2) * twid_a[2 * d + i] + a; + + out[(1 * d + i) * howmany.v + k] = + (s0_fwd + c0_bck * t0) * twid_a[3 * d + i] + + (s1_fwd + c1_bck * t1) * twid_a[4 * d + i] + + (s2_fwd + c2_bck * t2) * twid_a[5 * d + i]; + + out[(2 * d + i) * howmany.v + k] = + (c0_fwd + s0_bck * t0) * twid_a[6 * d + i] + + (c1_fwd + s1_bck * t1) * twid_a[7 * d + i] + + (c2_fwd + s2_bck * t2) * twid_a[8 * d + i] - a; + + out[(3 * d + i) * howmany.v + k] = + (s0_fwd + c0_bck * t0) * twid_a[9 * d + i] + + (s1_fwd + c1_bck * t1) * twid_a[10 * d + i] + + (s2_fwd + c2_bck * t2) * twid_a[11 * d + i]; + + out[(4 * d + i) * howmany.v + k] = + (c0_fwd + s0_bck * t0) * twid_a[12 * d + i] + + (c1_fwd + s1_bck * t1) * twid_a[13 * d + i] + + (c2_fwd + s2_bck * t2) * twid_a[14 * d + i] + a; + + out[(5 * d + i) * howmany.v + k] = + (s0_fwd + c0_bck * t0) * twid_a[15 * d + i] + + (s1_fwd + c1_bck * t1) * twid_a[16 * d + i] + + (s2_fwd + c2_bck * t2) * twid_a[17 * d + i]; + + out[(6 * d + i) * howmany.v + k] = + (c0_fwd + s0_bck * t0) * twid_a[18 * d + i] + + (c1_fwd + s1_bck * t1) * twid_a[19 * d + i] + + (c2_fwd + s2_bck * t2) * twid_a[20 * d + i] - a; + } + } + + for (int j = 1; j < 7; j++) { + // eqn. 11 + for (int i = 1; i < d; i++) { + ARMRAL_PRAGMA_AUTOVEC + for (int k = 0; k < howmany.v; k++) { + out[(d * j + i) * howmany.v + k] = + 2 * out[(d * j + i) * howmany.v + k] - + out[(d * j - i) * howmany.v + k]; + } + } + } + }; +}; + +template<> +utils::unique_ptr create_odd_radix_plan<7>(int n, + int howmany) { + switch (howmany) { + case 1: + return utils::make_unique>>(n, howmany); + case 2: + return utils::make_unique>>(n, howmany); + case 3: + return utils::make_unique>>(n, howmany); + default: + return utils::make_unique>(n, howmany); + } +} + +} // namespace armral::dct diff --git a/src/LowerPHY/DCT/static_value.hpp b/src/LowerPHY/DCT/static_value.hpp new file mode 100644 index 0000000000000000000000000000000000000000..856e44036823e1574747c77045fc70baa8ac4d6f --- /dev/null +++ b/src/LowerPHY/DCT/static_value.hpp @@ -0,0 +1,18 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause +*/ +#pragma once + +template +class static_value { + /* Utility class to allow DCT stages to provide specialisations for certain + values of some parameter while still allowing arbitrary values of that + parameter to be chosen at runtime (via the partner class dynamic_value). */ +public: + static_value(int /* unused */) {} + + static constexpr int v = V; +}; diff --git a/test/LowerPHY/DCT/main.cpp b/test/LowerPHY/DCT/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4ce45a407e8e96a834627aaf2bb1256b20a1c5cd --- /dev/null +++ b/test/LowerPHY/DCT/main.cpp @@ -0,0 +1,130 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause +*/ +#include "armral.h" +#include "f16_utils.hpp" + +#include +#include +#include +#include + +namespace { + +void do_reference_transform(int n, int howmany, const armral_half *in, + int istride, int idist, armral_half *out, + int ostride, int odist) { + // Calculate C, the matrix for N-point DCT-III + std::vector mat(n * n); + for (int i = 0; i < n; i++) { + mat[i] = 0.5; + for (int j = 1; j < n; j++) { + mat[j * n + i] = std::cos(M_PI * (0.5 + i) * j / n); + } + } + + // Apply it by naive gemv, with scale by 1/N + for (int k = 0; k < howmany; k++) { + for (int j = 0; j < n; j++) { + double y = 0; + for (int i = 0; i < n; i++) { + y += in[i * istride + k * idist] * mat[i * n + j]; + } + out[j * ostride + k * odist] = static_cast(y / n); + } + } +} + +bool dct_test(int n, int howmany, int istride, int idist, int ostride, + int odist) { + // Machine epsilon scaled by transform complexity + const double base_tol = armral::utils::eps() * n * std::log2(n); + bool passed = true; + + const auto verify_values_match = + [&passed, howmany, istride, idist, ostride, odist, + n](const std::vector &res, + const std::vector &ref, int n_samples, double tol, + int ref_stride, int ref_dist) { + for (int k = 0; k < howmany; k++) { + for (int i = 0; i < n_samples; i++) { + const double refx = ref[i * ref_stride + k * ref_dist]; + const double resx = res[i * ostride + k * odist]; + const double err = std::abs((refx - resx) / refx); + + if (err > tol) { + if (passed) { + // First failure: print table header + passed = false; + std::cerr << "Error! [DCT] n=" << n << " howmany=" << howmany + << " istride=" << istride << " idist=" << idist + << " ostride=" << ostride << " odist=" << odist + << " tol=" << tol << '\n'; + std::cerr << "k\ti\tres\tref\terr\n"; + } + std::cerr << k << '\t' << i << '\t' << resx << '\t' << refx + << '\t' << err << '\n'; + } + } + } + }; + + // Initialise input values + armral::utils::f16_random random; + const auto isize = std::max(howmany, istride) * std::max(n, idist); + const auto x = random.vector(isize, 1, 2); + + // Do the DCT into y + const auto osize = std::max(howmany, ostride) * std::max(n, odist); + std::vector y(osize); + armral_dct_plan_t *plan; + armral_dct_create_batch_plan_f16(&plan, n, howmany, istride, idist, ostride, + odist); + armral_dct_execute_f16(plan, x.data(), y.data()); + armral_dct_destroy_plan_f16(&plan); + + // First element of DCT should be the sum of the input sequence scaled by 2 + std::vector sums(howmany); + for (int k = 0; k < howmany; k++) { + double sum = 0; + for (int i = 0; i < n; i++) { + sum += x[i * istride + k * idist]; + } + sums[k] = static_cast(sum) * 2; + } + // ref_stride is unused in the following call because n_samples is 1 + verify_values_match(y, sums, 1, base_tol, 1, 1); + + // Then do inverse DCT and compare with the input sequence + std::vector should_match_x(n * howmany); + do_reference_transform(n, howmany, y.data(), ostride, odist, + should_match_x.data(), ostride, odist); + // Double tolerance as we have done 2 transforms + verify_values_match(x, should_match_x, n, 2 * base_tol, ostride, odist); + + return passed; +} + +bool do_interleaved_test(int n, int howmany) { + return dct_test(n, howmany, howmany, 1, howmany, 1); +} + +} // namespace + +int main() { + bool passed = true; + + const std::vector ns{2, 3, 4, 5, 6, 7, 8, 9, + 12, 25, 49, 54, 896, 300, 81 * 8, 49 * 5 * 3}; + const std::vector howmanys{1, 2, 4, 7, 14}; + + for (const int n : ns) { + for (const int howmany : howmanys) { + passed &= do_interleaved_test(n, howmany); + } + } + exit(passed ? EXIT_SUCCESS : EXIT_FAILURE); +} diff --git a/utils/f16_utils.hpp b/utils/f16_utils.hpp new file mode 100644 index 0000000000000000000000000000000000000000..d1949f40c97405839ea9bf8b0a78abebf2843693 --- /dev/null +++ b/utils/f16_utils.hpp @@ -0,0 +1,59 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause +*/ +#pragma once + +#include "armral.h" +#include "rng.hpp" + +#include + +namespace armral::utils { + +class f16_random : public base_random { +public: + f16_random(std::initializer_list seeds = {42}) + : base_random(seeds) {} + + static constexpr armral_half default_min = -1; + static constexpr armral_half default_max = 1; + + armral_half one(armral_half min = default_min, + armral_half max = default_max) { + return rand(min, max); + } + + std::vector vector(size_t len, armral_half min = default_min, + armral_half max = default_max) { + return base_random::vector_impl(len, min, max); + } +}; + +template +struct float_uint; + +template<> +struct float_uint { + using type = uint16_t; +}; + +template +using float_uint_t = typename float_uint::type; + +// Get machine epsilon b^-(p-1) +template +inline T eps() { + union { + float_uint_t ival; + T fval; + } val; + + val.fval = (T)1; + ++val.ival; + return val.fval - (T)1; +} + +} // namespace armral::utils diff --git a/utils/pod_vector.hpp b/utils/pod_vector.hpp new file mode 100644 index 0000000000000000000000000000000000000000..a1c48a307160d3b4abbdf84ad186f4b19f7751c2 --- /dev/null +++ b/utils/pod_vector.hpp @@ -0,0 +1,511 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause +*/ +#pragma once + +#include "reallocator.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace armral::utils { + +template +constexpr bool is_input_iterator = + std::is_base_of_v::iterator_category>; + +/** A vector-like container for value types that will survive not having a + * constructor called (e.g. float/double/std::complex). Calling .resize() on + * this container will leave any newly allocated data uninitialised, unlike + * std::vector which will default-initialise it. + */ +template +class pod_vector { +public: + using size_type = std::size_t; + using value_type = T; + using pointer = T *; + using const_pointer = const T *; + using iterator = pointer; + using const_iterator = const_pointer; + using reference = T &; + using reverse_iterator = std::reverse_iterator; + using const_reverse_iterator = std::reverse_iterator; + +private: + static constexpr size_type growth_factor{2}; + + value_type *m_data = nullptr; + size_type m_capacity = 0; + size_type m_size = 0; + R m_realloc; + + template + friend class pod_vector; + + /** + * Allocates or reallocates (copying the data) the data_pointer to the + * specified size (in number of elements of T), updating the capacity in + * the process. + */ + void realloc(size_type size) { + if (size != m_capacity) { + m_data = static_cast(m_realloc.reallocate(m_data, sizeof(T) * size)); + m_capacity = size; + } + } + +public: + pod_vector(const R &realloc = R()) : m_realloc{realloc} {} + + pod_vector(const pod_vector &other) { + *this = other; + } + + pod_vector(pod_vector &&other) { + *this = std::move(other); + } + + pod_vector(size_type size, const R &realloc = R()) : m_realloc{realloc} { + resize(size); + } + + pod_vector(size_type size, const T &value, const R &realloc = R()) + : m_realloc{realloc} { + assign(size, value); + } + + template>> + pod_vector(It1 first, It1 last, const R &realloc = R()) : m_realloc{realloc} { + assign(first, last); + } + + ~pod_vector() { + assert(m_size <= m_capacity); + assert(m_data || m_capacity == 0); + if (m_data) { + m_realloc.deallocate(m_data); + } + } + + pod_vector &operator=(pod_vector &&other) { + swap(other); + return *this; + } + + pod_vector &operator=(const pod_vector &other) { + if (&other == this) { + return *this; + } + realloc(other.m_capacity); + m_size = other.m_size; + std::memcpy((void *)m_data, (const void *)other.m_data, sizeof(T) * m_size); + m_realloc = other.m_realloc; + return *this; + } + + void assign(size_type size, const T &value) { + resize(size); + std::fill_n(m_data, size, value); + } + + template>> + void assign(It1 first, It1 last) { + resize(std::distance(first, last)); + std::copy(first, last, m_data); + } + + reference at(size_type pos) { + assert(pos >= 0); + assert(pos < m_size); + assert(m_data); + return m_data[pos]; + } + + reference at(size_type pos) const { + assert(pos >= 0); + assert(pos < m_size); + assert(m_data); + return m_data[pos]; + } + + reference operator[](size_type pos) { + return m_data[pos]; + } + + reference operator[](size_type pos) const { + return m_data[pos]; + } + + reference front() { + return at(0); + } + + reference front() const { + return at(0); + } + + reference back() { + return at(m_size - 1); + } + + reference back() const { + return at(m_size - 1); + } + + pointer data() { + return m_data; + } + + const_pointer data() const { + return m_data; + } + + iterator begin() { + return m_data; + } + + const_iterator begin() const { + return m_data; + } + + const_iterator cbegin() const { + return m_data; + } + + iterator end() { + return m_data + m_size; + } + + const_iterator end() const { + return m_data + m_size; + } + + const_iterator cend() const { + return m_data + m_size; + } + + reverse_iterator rbegin() { + return {end()}; + } + + const_reverse_iterator rbegin() const { + return {end()}; + } + + const_reverse_iterator crbegin() const { + return {cend()}; + } + + reverse_iterator rend() { + return {begin()}; + } + + const_reverse_iterator rend() const { + return {begin()}; + } + + const_reverse_iterator crend() const { + return {cbegin()}; + } + + bool empty() const { + return m_size == 0; + } + + size_type size() const { + return m_size; + } + + size_type capacity() const { + return m_capacity; + } + + void reserve(size_type new_cap) { + if (new_cap <= m_capacity) { + return; + } + realloc(new_cap); + assert(m_data); + } + + void shrink_to_fit() { + realloc(m_size); + assert(m_data || m_size == 0); + } + + void resize(size_type count) { + realloc(count); + assert(m_data || count == 0); + m_size = count; + } + + void resize(size_type count, const value_type &value) { + realloc(count); + assert(m_data || count == 0); + if (count > m_size) { + std::fill(m_data + m_size, m_data + count, value); + } + m_size = count; + } + + void clear() { + resize(0); + } + + void swap(pod_vector &other) { + std::swap(m_data, other.m_data); + std::swap(m_capacity, other.m_capacity); + std::swap(m_size, other.m_size); + } + + void push_back(T value) { + if (m_size == m_capacity) { + size_type new_capacity = m_capacity == 0 ? 1 : m_capacity * growth_factor; + realloc(new_capacity); + } + m_data[m_size++] = std::move(value); + } +}; + +/** A wrapper around void* equivalent roughly to void&. + * void& is disallowed by the language, but if all we want + * is to take the address of it there's no problem. */ +class void_ref { + void *m_ptr; + +public: + void_ref(void *ptr) : m_ptr(ptr) {} + + void *operator&() { + return m_ptr; + } + + const void *operator&() const { + return m_ptr; + } +}; + +/** A vector-like container for unknown value types (roughly a + * pod_vector). This allows us to keep the memory management bits + * that we like from pod_vector while not caring about the exact underlying + * type. + */ +template +class pod_vector { +public: + using size_type = std::size_t; + using pointer = void *; + using const_pointer = const void *; + using reference = void_ref; + +private: + void *m_data = nullptr; + size_type m_capacity = 0; + size_type m_size = 0; + R m_realloc; + + /** In addition to the parameters from a standard pod_vector, we must also + * keep the size of each element, since we don't know it otherwise. */ + size_type m_elem_size = 0; + +public: + pod_vector() = default; + + template + pod_vector(const pod_vector &other) { + *this = other; + } + + template + pod_vector(pod_vector &&other) { + *this = std::move(other); + } + + pod_vector(const pod_vector &other) { + *this = other; + } + + pod_vector(pod_vector &&other) { + *this = std::move(other); + } + + /** We can move into this from any other pod_vector, since we can know the + * element size by doing sizeof(T) */ + template + pod_vector &operator=(pod_vector &&other) { + if ((void *)this == (void *)&other) { + return *this; + } + if (m_data) { + m_realloc.deallocate(m_data); + } + m_data = other.m_data; + m_capacity = other.m_capacity; + m_size = other.m_size; + m_elem_size = sizeof(T); + other.m_data = nullptr; + other.m_capacity = other.m_size = 0; + return *this; + } + + template + pod_vector &operator=(const pod_vector &other) { + if ((void *)this == (void *)&other) { + return *this; + } + if (other.m_capacity * sizeof(T) != m_capacity * m_elem_size) { + m_data = m_realloc.reallocate(m_data, other.m_capacity * sizeof(T)); + } + m_capacity = other.m_capacity; + m_size = other.m_size; + m_elem_size = sizeof(T); + std::memcpy(m_data, other.m_data, m_elem_size * m_size); + return *this; + } + + /** We can move into this from any other pod_vector, since we already + * have the (original) element size. */ + pod_vector &operator=(pod_vector &&other) { + if (this == &other) { + return *this; + } + if (m_data != nullptr) { + m_realloc.deallocate(m_data); + } + m_data = other.m_data; + m_capacity = other.m_capacity; + m_size = other.m_size; + m_elem_size = other.m_elem_size; + other.m_data = nullptr; + other.m_capacity = other.m_size = other.m_elem_size = 0; + return *this; + } + + pod_vector &operator=(const pod_vector &other) { + if (&other == this) { + return *this; + } + if (other.m_capacity * other.m_elem_size != m_capacity * m_elem_size) { + m_data = + m_realloc.reallocate(m_data, other.m_capacity * other.m_elem_size); + } + m_capacity = other.m_capacity; + m_size = other.m_size; + m_elem_size = other.m_elem_size; + std::memcpy(m_data, other.m_data, m_elem_size * m_size); + return *this; + } + + ~pod_vector() { + assert(m_size <= m_capacity); + assert(m_data || m_capacity == 0); + if (m_data != nullptr) { + m_realloc.deallocate(m_data); + } + } + + reference at(size_type pos) { + assert(pos >= 0); + assert(pos < m_size); + assert(m_data); + uintptr_t data = reinterpret_cast(m_data); + return reinterpret_cast(data + pos * m_elem_size); + } + + reference at(size_type pos) const { + assert(pos >= 0); + assert(pos < m_size); + assert(m_data); + uintptr_t data = reinterpret_cast(m_data); + return reinterpret_cast(data + pos * m_elem_size); + } + + reference operator[](size_type pos) { + uintptr_t data = reinterpret_cast(m_data); + return reinterpret_cast(data + pos * m_elem_size); + } + + reference operator[](size_type pos) const { + uintptr_t data = reinterpret_cast(m_data); + return reinterpret_cast(data + pos * m_elem_size); + } + + reference front() { + return at(0); + } + + reference front() const { + return at(0); + } + + reference back() { + return at(m_size - 1); + } + + reference back() const { + return at(m_size - 1); + } + + pointer data() { + return m_data; + } + + const_pointer data() const { + return m_data; + } + + bool empty() const { + return m_size == 0; + } + + size_type size() const { + return m_size; + } + + size_type capacity() const { + return m_capacity; + } +}; + +template +struct is_vec_type : std::false_type {}; + +template +struct is_vec_type> : std::true_type {}; + +template +struct is_vec_type> : std::true_type {}; + +/** + * Utility to help check that a template parameter is a vector type + * (either std::vector or @ref armral::utils::pod_vector for now) + */ +template +constexpr bool is_vec_type_v = is_vec_type::value; + +template +struct has_complex_value_type : std::false_type {}; + +template +struct has_complex_value_type>> : std::true_type {}; + +template +struct has_complex_value_type, R>> + : std::true_type {}; + +template +constexpr bool has_complex_value_type_v = has_complex_value_type::value; + +} // namespace armral::utils diff --git a/utils/pragmas.hpp b/utils/pragmas.hpp new file mode 100644 index 0000000000000000000000000000000000000000..c32de15c1ec7e28d988fd5cdafd178db3da5f027 --- /dev/null +++ b/utils/pragmas.hpp @@ -0,0 +1,17 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause +*/ +#pragma once + +#if __clang__ + +#define ARMRAL_PRAGMA_AUTOVEC _Pragma("clang loop vectorize(enable)") + +#else + +#define ARMRAL_PRAGMA_AUTOVEC _Pragma("GCC ivdep") + +#endif diff --git a/utils/reallocator.hpp b/utils/reallocator.hpp new file mode 100644 index 0000000000000000000000000000000000000000..63a5e2f2701020907ccadc1de0c474b1bdd74f86 --- /dev/null +++ b/utils/reallocator.hpp @@ -0,0 +1,39 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause +*/ +#pragma once + +#include + +namespace armral::utils { + +/** A derivative of the standard allocator interface which offers reallocation + * of previously allocated memory. std::realloc is used for all reallocations, + * and like std::realloc an exact number of bytes must be specified for + * reallocation. + */ +class reallocator { +public: + using pointer = void *; + using const_pointer = const pointer; + using size_type = std::size_t; + + constexpr reallocator() noexcept = default; + + constexpr reallocator(const reallocator &other) noexcept = default; + + ~reallocator() = default; + + static pointer reallocate(const_pointer ptr, size_type new_size) { + return static_cast(std::realloc(ptr, new_size)); + } + + static void deallocate(const_pointer ptr) { + std::free(ptr); + } +}; + +} // namespace armral::utils diff --git a/utils/rng.cpp b/utils/rng.cpp index 23e03edaf64b785ca024bd3690cca74cbb547de1..1211019eecd526b07a9f7bffc16f1d339b694f90 100644 --- a/utils/rng.cpp +++ b/utils/rng.cpp @@ -6,6 +6,7 @@ */ #include +#include "armral.h" #include "rng.hpp" namespace armral::utils { @@ -42,6 +43,13 @@ float32_t linear_congruential_generator::one(random_state *state) { return (float32_t)x / 0x80000000U; } +template<> +armral_half +linear_congruential_generator::one(random_state *state) { + return static_cast( + linear_congruential_generator::one(state)); +} + template<> double linear_congruential_generator::one(random_state *state) { auto x = lcg_step(state->seed); diff --git a/utils/rng.hpp b/utils/rng.hpp index d366a8ed7840558b5529c3bae3eedc680da3b0a3..a2da3d02341e3be596138d1ee2a801281917afcd 100644 --- a/utils/rng.hpp +++ b/utils/rng.hpp @@ -7,6 +7,8 @@ #pragma once +#include "armral.h" + #include #include #include @@ -47,8 +49,9 @@ public: * @param[out] max The upper bound of the generated value. * @returns a pseudo-randomly generated floating-point value. */ - template, bool> = true> + template || + std::is_same_v, + bool> = true> T one(random_state *state, const T min, const T max) { return (max - min) * one(state) + min; } diff --git a/utils/unique_ptr.hpp b/utils/unique_ptr.hpp new file mode 100644 index 0000000000000000000000000000000000000000..3e6f3f6c2da94f168ca83dd69fe11f127cd4dd73 --- /dev/null +++ b/utils/unique_ptr.hpp @@ -0,0 +1,74 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause +*/ +#pragma once + +#include +#include +#include + +namespace armral::utils { + +template +class unique_ptr { + /* Custom minimal implementation of unique_ptr. std::unique_ptr may invoke + operator new, which introduces dependency on libstdc++. This version uses + malloc and placement new, which has no such dependency. */ +public: + explicit unique_ptr(T *ptr) noexcept : m_ptr(ptr) {} + + /* Delete copy-assignment and copy-constructor - if you can copy it then it's + not unique! */ + unique_ptr(const unique_ptr &) = delete; + unique_ptr &operator=(const unique_ptr &) = delete; + + /* Converting constructor - this is required for implicit conversion from + unique_ptr to unique_ptr. */ + template && + std::is_convertible_v, + bool> = true> + unique_ptr(unique_ptr &&other) noexcept : m_ptr{other.release()} {} + + unique_ptr(unique_ptr &&other) noexcept : m_ptr{other.release()} {} + + ~unique_ptr() { + // Check for null ptr - this may have been moved or released + if (m_ptr) { + m_ptr->~T(); + free(m_ptr); + } + } + + T *release() noexcept { + T *copy = m_ptr; + m_ptr = nullptr; + return copy; + } + + T *get() const noexcept { + return m_ptr; + } + + explicit operator bool() const noexcept { + return m_ptr; + } + + T *operator->() const noexcept { + return m_ptr; + } + +private: + T *m_ptr; +}; + +template +unique_ptr make_unique(Args &&...args) { + T *buf = (T *)malloc(sizeof(T)); + T *obj = new (buf) T(std::forward(args)...); + return unique_ptr(obj); +} + +} // namespace armral::utils