diff --git a/CMakeLists.txt b/CMakeLists.txt
index 33835c7a0bd8d0360fbd696590718a9ad369a06a..a1136538d119c69ec40fb37c5d697c9b3c210633 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -59,6 +59,19 @@ set(ARMRAL_LIB_SOURCES
     ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_compression.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_decompression.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/Correlation/arm_correlation.c
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/DCT/dct_interface.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/DCT/dct_plan.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/DCT/direct_2_point_plan.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/DCT/direct_3_point_plan.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/DCT/direct_4_point_plan.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/DCT/direct_5_point_plan.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/DCT/direct_7_point_plan.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/DCT/direct_8_point_plan.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/DCT/direct_9_point_plan.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/DCT/radix_2_plan.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/DCT/radix_3_plan.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/DCT/radix_5_plan.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/DCT/radix_7_plan.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/bluestein.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gs.c
@@ -575,6 +588,7 @@ if(BUILD_TESTING)
   add_armral_test(turbo_rate_recovery
                   test/UpperPHY/Turbo/Single/RateRecovery/main.cpp)
   add_armral_test(svd test/MatrixFactorizations/SVD/main.cpp)
+  add_armral_test(dct test/LowerPHY/DCT/main.cpp)
 
   add_armral_bench(
     matrix_inv_batch_general
@@ -782,6 +796,7 @@ if(BUILD_TESTING)
   add_armral_bench(turbo_rate_recovery
                    bench/UpperPHY/Turbo/Single/RateRecovery/main.cpp)
   add_armral_bench(svd bench/MatrixFactorizations/SVD/main.cpp)
+  add_armral_bench(dct_f16 bench/LowerPHY/DCT/F16/main.cpp)
 endif()
 
 if(BUILD_EXAMPLES)
@@ -816,6 +831,7 @@ if(BUILD_EXAMPLES)
   add_armral_example(examples/fft_2d_cf32_example.c 4 5)
   add_armral_example(examples/modulation_example.c)
   add_armral_example(examples/polar_example.cpp 128 100 35)
+  add_armral_example(examples/dct_f16_interleaved_example.c 12 8)
 endif()
 
 if(BUILD_SIMULATION)
diff --git a/bench/LowerPHY/DCT/F16/bench.py b/bench/LowerPHY/DCT/F16/bench.py
new file mode 100755
index 0000000000000000000000000000000000000000..49f06a2c6e047a1eeb0e4747e65f63d22690779b
--- /dev/null
+++ b/bench/LowerPHY/DCT/F16/bench.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python3
+# Arm RAN Acceleration Library
+# SPDX-FileCopyrightText: <text>Copyright 2025 Arm Limited and/or its
+# affiliates <open-source-office@arm.com></text>
+# SPDX-License-Identifier: BSD-3-Clause
+
+import json
+from pathlib import Path
+import os
+
+
+def get_path(x):
+    return x if Path(x).is_file() else os.path.join("armral", x)
+
+
+exe_name = get_path("bench_dct_f16")
+
+reps_base = 10000000
+howmanys = [1, 2, 8]
+ns = [2, 3, 4, 5, 7, 9, 12, 20, 36, 72, 100, 4 * 49]
+
+j = {"exe_name": exe_name, "cases": []}
+
+for n in ns:
+    for howmany in howmanys:
+        if howmany == 1:
+            name = f"dct_f16_{n}_point_{howmany}"
+        else:
+            name = f"dct_f16_{n}_point_{howmany}_interleaved"
+        case = {
+            "name": name,
+            "args": f"{n} {howmany} {howmany} 1 {howmany} 1",
+            "reps": reps_base // n,
+        }
+
+        j["cases"].append(case)
+
+print(json.dumps(j))
diff --git a/bench/LowerPHY/DCT/F16/main.cpp b/bench/LowerPHY/DCT/F16/main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..95995019de489cdbde8378f1661259c83a71ad73
--- /dev/null
+++ b/bench/LowerPHY/DCT/F16/main.cpp
@@ -0,0 +1,65 @@
+/*
+    Arm RAN Acceleration Library
+    SPDX-FileCopyrightText: <text>Copyright 2025 Arm Limited and/or its
+    affiliates <open-source-office@arm.com></text>
+    SPDX-License-Identifier: BSD-3-Clause
+*/
+#include "armral.h"
+
+#include <cassert>
+#include <cstdio>
+#include <cstdlib>
+#include <vector>
+
+namespace {
+
+void run_dct_f16_perf(uint32_t n, uint32_t howmany, uint32_t istride,
+                      uint32_t idist, uint32_t ostride, uint32_t odist,
+                      uint32_t num_reps) {
+  printf("[DCT f16] - n = %u, howmany = %u, number of iterations = "
+         "%u\n",
+         n, howmany, num_reps);
+  printf("            istride = %u, idist = %u, ostride = %u, odist = %u\n",
+         istride, idist, ostride, odist);
+
+  const std::vector<armral_half> x(n * howmany);
+  std::vector<armral_half> y(n * howmany);
+
+  armral_dct_plan_t *p = nullptr;
+  armral_dct_create_batch_plan_f16(&p, n, howmany, istride, idist, ostride,
+                                   odist);
+  assert(p);
+  for (uint32_t i = 0; i < num_reps; ++i) {
+    armral_dct_execute_f16(p, x.data(), y.data());
+  }
+
+  armral_dct_destroy_plan_f16(&p);
+}
+
+} // anonymous namespace
+
+int main(int argc, char **argv) {
+  if (argc != 8) {
+    // n
+    // howmany
+    // istride
+    // idist
+    // ostride
+    // odist
+    // nreps    - The number of times to repeat the function
+    fprintf(stderr, "usage: %s n howmany istride idist ostride odist nreps\n",
+            argv[0]);
+    exit(EXIT_FAILURE);
+  }
+  const auto n = (uint32_t)atoi(argv[1]);
+  const auto howmany = (uint32_t)(atoi(argv[2]));
+  const auto istride = (uint32_t)(atoi(argv[3]));
+  const auto idist = (uint32_t)(atoi(argv[4]));
+  const auto ostride = (uint32_t)(atoi(argv[5]));
+  const auto odist = (uint32_t)(atoi(argv[6]));
+  const auto nreps = (uint32_t)(atoi(argv[7]));
+
+  run_dct_f16_perf(n, howmany, istride, idist, ostride, odist, nreps);
+
+  return EXIT_SUCCESS;
+}
diff --git a/examples/dct_f16_interleaved_example.c b/examples/dct_f16_interleaved_example.c
new file mode 100644
index 0000000000000000000000000000000000000000..d7da93dde248b2badd24489ca058cf59acef8c6e
--- /dev/null
+++ b/examples/dct_f16_interleaved_example.c
@@ -0,0 +1,103 @@
+/*
+    Arm RAN Acceleration Library
+    SPDX-FileCopyrightText: <text>Copyright 2025 Arm Limited and/or its
+    affiliates <open-source-office@arm.com></text>
+    SPDX-License-Identifier: BSD-3-Clause
+*/
+#include "armral.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+
+// This function shows how to create a plan and execute a DCT on interleaved
+// data using the ArmRAL library
+static void example_dct_plan_and_execute(int n, int howmany) {
+  armral_dct_plan_t *p;
+  printf("Planning DCT of length = %d, howmany = %d\n", n, howmany);
+
+  // Layout is specifified by stride and dist parameters. Interleaved layout
+  // means:
+  //   istride == ostride == howmany
+  //   idist == odist == 1
+  armral_dct_create_batch_plan_f16(&p, n, howmany, howmany, 1, howmany, 1);
+
+  // Initialise input array, and allocate output array (transforms may be
+  // in-place, but this example does an out-of-place transform).
+  armral_half *x = (armral_half *)malloc(n * howmany * sizeof(armral_half));
+  armral_half *y = (armral_half *)malloc(n * howmany * sizeof(armral_half));
+  for (int i = 0; i < n; i++) {
+    for (int k = 0; k < howmany; k++) {
+      x[i * howmany + k] = (armral_half)((float)rand() / RAND_MAX);
+    }
+  }
+
+  puts("Input data (interleaved transform operates on columns in the below "
+       "table):");
+
+  printf("     ");
+  for (int k = 0; k < howmany; k++) {
+    printf(" k=%-2d", k);
+  }
+  puts("");
+
+  for (int i = 0; i < n; i++) {
+    printf("i=%-2d ", i);
+    for (int k = 0; k < howmany; k++) {
+      printf("%.2f ", (float)x[i * howmany + k]);
+    }
+    puts("");
+  }
+
+  // Execute the batch of transforms. Input and output arrays need to have
+  // enough space allocated to support the specified layout. This is not checked
+  // by the library
+  printf("Performing batch of %d DCTs of length %d\n", howmany, n);
+  armral_dct_execute_f16(p, x, y);
+
+  // A plan can be re-used to solve other DCTs, but once it is no longer needed
+  // it should to be destroyed to avoid leaking memory.
+  puts("Destroying plan");
+  armral_dct_destroy_plan_f16(&p);
+
+  puts("Output data:");
+
+  printf("      ");
+  for (int k = 0; k < howmany; k++) {
+    printf(" k=%-3d", k);
+  }
+  puts("");
+
+  for (int i = 0; i < n; i++) {
+    printf("i=%-2d ", i);
+    for (int k = 0; k < howmany; k++) {
+      printf("%5.2f ", (float)y[i * howmany + k]);
+    }
+    puts("");
+  }
+
+  // Free memory allocated for input and output. This is the responsibility of
+  // the user, not the plan.
+  free(x);
+  free(y);
+}
+
+int main(int argc, char **argv) {
+  if (argc < 2) {
+    printf("Usage: %s n howmany\n", argv[0]);
+    exit(EXIT_FAILURE);
+  }
+
+  int n = atoi(argv[1]);
+  if (n < 1) {
+    printf("Length parameter must be positive and non-zero\n");
+    exit(EXIT_FAILURE);
+  }
+
+  int howmany = atoi(argv[2]);
+  if (howmany < 1) {
+    printf("Batch size parameter must be positive and non-zero\n");
+    exit(EXIT_FAILURE);
+  }
+
+  example_dct_plan_and_execute(n, howmany);
+}
diff --git a/include/armral.h b/include/armral.h
index 70d795fe65996f2e435f9fefb699d486ea0ede7e..f05fea3f7599932e6483e9709ec008b98ec78a45 100644
--- a/include/armral.h
+++ b/include/armral.h
@@ -108,6 +108,7 @@ typedef enum {
   ARMRAL_SUCCESS = 0,         ///< No error.
   ARMRAL_ARGUMENT_ERROR = -1, ///< One or more arguments are incorrect.
   ARMRAL_FAIL = -2,           ///< A failure has been detected.
+  ARMRAL_UNSUPPORTED = -3     ///< The requested operation is not supported.
 } armral_status;
 
 /**
@@ -200,6 +201,12 @@ typedef struct {
   float32_t im; ///< 32-bit imaginary component.
 } armral_cmplx_f32_t;
 
+#ifndef __clang__
+typedef float armral_half __attribute__((__mode__(HF)));
+#else
+typedef _Float16 armral_half;
+#endif
+
 /**
  * The number of complex samples in each compressed block.
  */
@@ -3381,6 +3388,82 @@ armral_status armral_fft_execute_cs16(const armral_fft_plan_t *p,
  */
 armral_status armral_fft_destroy_plan_cs16(armral_fft_plan_t **p);
 
+/**
+ * The opaque structure representing a DCT plan. DCT plan must be initialised
+ * with \ref armral_dct_create_batch_plan_f16.
+ */
+typedef struct armral_dct_plan_t armral_dct_plan_t;
+
+/**
+ * @brief Creates a DCT plan.
+ *
+ * Initialises a DCT plan and sets the out-parameter `p` to point to the created
+ * plan. It is efficient to create plans once and reuse them, rather than
+ * creating a plan for every execute call.
+ *
+ * Memory layout of the input and output arrays is specified by the stride and
+ * dist parameters.
+ *
+ * To avoid memory leaks, call \ref armral_dct_destroy_plan_f16 when you no
+ * longer need this plan.
+ *
+ * @param[in,out] p A pointer to the DCT plan pointer.
+ * @param[in] n       The size of DCT to be computed by this plan.
+ * @param[in] howmany The number of transforms to compute.
+ * @param[in] istride The distance between adjacent elements of the input
+ *                    sequence to the transform. For example istride of 1 means
+ *                    that the transform is done on a contiguous block of
+ *                    memory. istride of 2 means that the input sequence is
+ *                    every other element of `in`.
+ * @param[in] idist   The distance between the i-th element of each input
+ *                    sequence. For instance if `istride=1` and `idist=n`, then
+ *                    each input sequence is stored contiguously in memory with
+ *                    no gaps between. If `istride=howmany` and `idist=1` then
+ *                    the input sequences are stored with perfect interleaving.
+ * @param[in] ostride The distance in memory between adjacent elements of the
+ *                    output sequence.
+ * @param[in] odist   The distance in memory between the i-th element of each
+ *                    output sequence.
+ * @return An `armral_status` value that indicates success or failure.
+ */
+armral_status armral_dct_create_batch_plan_f16(armral_dct_plan_t **p, int n,
+                                               int howmany, int istride,
+                                               int idist, int ostride,
+                                               int odist);
+
+/**
+ * @brief Executes a DCT plan.
+ *
+ * Uses the plan plan created by \ref armral_dct_create_batch_plan_f16 to
+ * perform the configured DCT with the specified arrays.
+ *
+ * @param[in] p    The plan to execute.
+ * @param[in] in   The input array.
+ * @param[out] out The output array.
+ * @return An `armral_status` value that indicates success or failure.
+ */
+armral_status armral_dct_execute_f16(const armral_dct_plan_t *p,
+                                     const armral_half *in, armral_half *out);
+
+/**
+ * @brief Destroys a DCT plan.
+ *
+ * Deallocates the plan and all resources used by it. The plan can no longer be
+ * used after destruction. On exit, `*p` is set to NULL.
+ *
+ * @param[in, out] p The plan to destroy.
+ * @return An `armral_status` value that indicates success or failure.
+ */
+armral_status armral_dct_destroy_plan_f16(armral_dct_plan_t **p);
+
+/**
+ * @brief Prints some debug information about the structure of a DCT plan.
+ *
+ * @param[in] p The plan to inspect.
+ * @return An `armral_status` value that indicates success or failure.
+ */
+armral_status armral_dct_print_plan_f16(armral_dct_plan_t *p);
+
 /** @} end fft */
 
 /**
diff --git a/src/LowerPHY/DCT/dct_interface.cpp b/src/LowerPHY/DCT/dct_interface.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e6873e64a5ba8dca864a61612dfecf37d0811643
--- /dev/null
+++ b/src/LowerPHY/DCT/dct_interface.cpp
@@ -0,0 +1,47 @@
+/*
+    Arm RAN Acceleration Library
+    SPDX-FileCopyrightText: <text>Copyright 2025 Arm Limited and/or its
+    affiliates <open-source-office@arm.com></text>
+    SPDX-License-Identifier: BSD-3-Clause
+*/
+#include "armral.h"
+#include "dct_plan.hpp"
+#include <cstdlib>
+extern "C" {
+
+armral_status armral_dct_create_batch_plan_f16(armral_dct_plan_t **p, int n,
+                                               int howmany, int istride,
+                                               int idist, int ostride,
+                                               int odist) {
+  auto plan =
+      armral::dct::make_dct_plan(n, howmany, istride, idist, ostride, odist);
+  /* make_dct_plan returns unique_ptr. Null ptr means the parameter set is not
+     supported. User's dct plan handle has a raw pointer in - use release to
+     stop default destruction on return.  */
+  if (plan) {
+    *p = plan.release();
+    return ARMRAL_SUCCESS;
+  }
+  return ARMRAL_UNSUPPORTED;
+}
+
+armral_status armral_dct_execute_f16(const armral_dct_plan_t *p,
+                                     const armral_half *in, armral_half *out) {
+  return p->execute(in, out);
+}
+
+armral_status armral_dct_destroy_plan_f16(armral_dct_plan_t **p) {
+  /* Plan was allocated using make_unique, but the user has a raw pointer not a
+     unique_ptr - have to call destructor followed by free manually in order to
+     deallocate.  */
+  (*p)->~armral_dct_plan_t();
+  free(*p);
+  *p = NULL;
+  return ARMRAL_SUCCESS;
+}
+
+armral_status armral_dct_print_plan_f16(armral_dct_plan_t *p) {
+  p->print_plan();
+  return ARMRAL_SUCCESS;
+}
+}
diff --git a/src/LowerPHY/DCT/dct_plan.cpp b/src/LowerPHY/DCT/dct_plan.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..123afeea6156ff2e8cd0443af9ca227f1716c047
--- /dev/null
+++ b/src/LowerPHY/DCT/dct_plan.cpp
@@ -0,0 +1,89 @@
+/*
+    Arm RAN Acceleration Library
+    SPDX-FileCopyrightText: <text>Copyright 2025 Arm Limited and/or its
+    affiliates <open-source-office@arm.com></text>
+    SPDX-License-Identifier: BSD-3-Clause
+*/
+#include "dct_plan.hpp"
+#include "direct_plan.hpp"
+#include "odd_radix_plan.hpp"
+#include "radix_2_plan.hpp"
+
+namespace {
+bool is_supported_by_mixed_radix_plan(int n) {
+  switch (n) {
+  case 1:
+    return false;
+  case 2:
+  case 3:
+  case 5:
+  case 7:
+    return true;
+  default:
+    if (n % 2 == 0) {
+      return is_supported_by_mixed_radix_plan(n / 2);
+    }
+    if (n % 3 == 0) {
+      return is_supported_by_mixed_radix_plan(n / 3);
+    }
+    if (n % 5 == 0) {
+      return is_supported_by_mixed_radix_plan(n / 5);
+    }
+    if (n % 7 == 0) {
+      return is_supported_by_mixed_radix_plan(n / 7);
+    }
+  }
+  return false;
+}
+} // namespace
+
+namespace armral::dct {
+
+utils::unique_ptr<armral_dct_plan_t> make_mixed_radix_plan(int n, int howmany) {
+  switch (n) {
+  case 2:
+    return create_direct_plan<2>(howmany);
+  case 3:
+    return create_direct_plan<3>(howmany);
+  case 4:
+    return create_direct_plan<4>(howmany);
+  case 5:
+    return create_direct_plan<5>(howmany);
+  case 7:
+    return create_direct_plan<7>(howmany);
+  case 8:
+    return create_direct_plan<8>(howmany);
+  case 9:
+    return create_direct_plan<9>(howmany);
+  default:
+    if (n % 2 == 0) {
+      return create_radix_2_plan(n, howmany);
+    }
+    if (n % 3 == 0) {
+      return create_odd_radix_plan<3>(n, howmany);
+    }
+    if (n % 5 == 0) {
+      return create_odd_radix_plan<5>(n, howmany);
+    }
+    if (n % 7 == 0) {
+      return create_odd_radix_plan<7>(n, howmany);
+    }
+    // Unsupported configuration
+    __builtin_unreachable();
+  }
+}
+
+utils::unique_ptr<armral_dct_plan_t> make_dct_plan(int n, int howmany,
+                                                   int istride, int idist,
+                                                   int ostride, int odist) {
+  /* Optimised path only supports interleaved format.  */
+  if (istride == howmany && ostride == howmany && odist == 1 && idist == 1 &&
+      is_supported_by_mixed_radix_plan(n)) {
+    return make_mixed_radix_plan(n, howmany);
+  }
+
+  /* No other configuration is currently supported.  */
+  return utils::unique_ptr<armral_dct_plan_t>(nullptr);
+}
+
+} // namespace armral::dct
diff --git a/src/LowerPHY/DCT/dct_plan.hpp b/src/LowerPHY/DCT/dct_plan.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..e5adc6acc6987abadf6e307e30d68cd063fbee26
--- /dev/null
+++ b/src/LowerPHY/DCT/dct_plan.hpp
@@ -0,0 +1,50 @@
+/*
+    Arm RAN Acceleration Library
+    SPDX-FileCopyrightText: <text>Copyright 2025 Arm Limited and/or its
+    affiliates <open-source-office@arm.com></text>
+    SPDX-License-Identifier: BSD-3-Clause
+*/
+#pragma once
+
+#include "armral.h"
+#include "unique_ptr.hpp"
+
+#include <cassert>
+
+struct armral_dct_plan_t {
+public:
+  virtual ~armral_dct_plan_t() = default;
+  armral_dct_plan_t() = default;
+  armral_dct_plan_t(const armral_dct_plan_t &) = delete;
+  armral_dct_plan_t &operator=(const armral_dct_plan_t &) = delete;
+
+  /* Cannot declare these pure virtual as this introduces dependency on
+     libstdc++ - instead define them as stubs.  */
+  virtual armral_status execute(const armral_half * /*unused*/,
+                                armral_half * /*unused*/) const {
+    assert(false);
+    __builtin_unreachable();
+  };
+
+  virtual void print_plan() const {
+    assert(false);
+  };
+
+  /* Virtual destructor may result in calls to operator delete (note the calls
+     are not in the destructor, but emitted in case delete is ever called on
+     armral_dct_plan_t. Define operator delete here to avoid dependency on
+     libstdc++, but don't even think about using it! Use utils::make_unique for
+     dynamically allocated DCT plan.  */
+  void operator delete(void * /*unused*/) {
+    assert(false);
+  };
+};
+
+namespace armral::dct {
+
+utils::unique_ptr<armral_dct_plan_t> make_mixed_radix_plan(int n, int howmany);
+
+utils::unique_ptr<armral_dct_plan_t> make_dct_plan(int n, int howmany,
+                                                   int istride, int idist,
+                                                   int ostride, int odist);
+} // namespace armral::dct
diff --git a/src/LowerPHY/DCT/direct_2_point_plan.cpp b/src/LowerPHY/DCT/direct_2_point_plan.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ee0c95e90b147f95add8b26c2a8fa241a44b0296
--- /dev/null
+++ b/src/LowerPHY/DCT/direct_2_point_plan.cpp
@@ -0,0 +1,33 @@
+/*
+    Arm RAN Acceleration Library
+    SPDX-FileCopyrightText: <text>Copyright 2025 Arm Limited and/or its
+    affiliates <open-source-office@arm.com></text>
+    SPDX-License-Identifier: BSD-3-Clause
+*/
+#include "direct_plan.hpp"
+#include "dynamic_value.hpp"
+#include "pragmas.hpp"
+
+namespace armral::dct {
+
+template<class Howmany>
+struct direct_plan_worker<2, Howmany> {
+  static armral_status execute(const armral_half *in, armral_half *out,
+                               Howmany howmany) {
+    constexpr auto rt2 = static_cast<armral_half>(0x1.6a09e667f3bcdp0);
+    ARMRAL_PRAGMA_AUTOVEC
+    for (int k = 0; k < howmany.v; k++) {
+      const auto x0 = in[k];
+      const auto x1 = in[howmany.v + k];
+      out[k] = 2 * (x0 + x1);
+      out[howmany.v + k] = rt2 * (x0 - x1);
+    }
+    return ARMRAL_SUCCESS;
+  };
+};
+
+template<>
+utils::unique_ptr<armral_dct_plan_t> create_direct_plan<2>(int howmany) {
+  return utils::make_unique<direct_plan<2, dynamic_value>>(howmany);
+}
+} // namespace armral::dct
diff --git a/src/LowerPHY/DCT/direct_3_point_plan.cpp b/src/LowerPHY/DCT/direct_3_point_plan.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1fa32aa010ac07ecd06c0eec8bba087bff229f23
--- /dev/null
+++ b/src/LowerPHY/DCT/direct_3_point_plan.cpp
@@ -0,0 +1,41 @@
+/*
+    Arm RAN Acceleration Library
+    SPDX-FileCopyrightText: <text>Copyright 2025 Arm Limited and/or its
+    affiliates <open-source-office@arm.com></text>
+    SPDX-License-Identifier: BSD-3-Clause
+*/
+#include "direct_plan.hpp"
+#include "dynamic_value.hpp"
+#include "pragmas.hpp"
+
+namespace armral::dct {
+
+template<class Howmany>
+struct direct_plan_worker<3, Howmany> {
+  static armral_status execute(const armral_half *in,
+                               armral_half *out, // NOLINT (LLVM bug 41393)
+                               Howmany howmany) {
+    // 2 * cos(pi / 6)
+    constexpr auto a = static_cast<armral_half>(0x1.bb67ae8584caap0);
+
+    ARMRAL_PRAGMA_AUTOVEC
+    for (int k = 0; k < howmany.v; k++) {
+      const auto x0 = in[howmany.v * 0 + k];
+      const auto x1 = in[howmany.v * 1 + k] * 2;
+      const auto x2 = in[howmany.v * 2 + k];
+      const auto p02 = x0 + x2;
+      out[howmany.v * 0 + k] = p02 * 2 + x1;
+      out[howmany.v * 1 + k] = (x0 - x2) * a;
+      out[howmany.v * 2 + k] = p02 - x1;
+    }
+
+    return ARMRAL_SUCCESS;
+  };
+};
+
+template<>
+utils::unique_ptr<armral_dct_plan_t> create_direct_plan<3>(int howmany) {
+  return utils::make_unique<direct_plan<3, dynamic_value>>(howmany);
+}
+
+} // namespace armral::dct
diff --git a/src/LowerPHY/DCT/direct_4_point_plan.cpp b/src/LowerPHY/DCT/direct_4_point_plan.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2c18d38d745876ecc2f76185671d3b59756eaaaf
--- /dev/null
+++ b/src/LowerPHY/DCT/direct_4_point_plan.cpp
@@ -0,0 +1,57 @@
+/*
+    Arm RAN Acceleration Library
+    SPDX-FileCopyrightText: <text>Copyright 2025 Arm Limited and/or its
+    affiliates <open-source-office@arm.com></text>
+    SPDX-License-Identifier: BSD-3-Clause
+*/
+#include "direct_plan.hpp"
+#include "dynamic_value.hpp"
+#include "pragmas.hpp"
+
+namespace armral::dct {
+
+template<class Howmany>
+struct direct_plan_worker<4, Howmany> {
+  static armral_status execute(const armral_half *in,
+                               armral_half *out, // NOLINT (LLVM bug 41393)
+                               Howmany howmany) {
+    /* 4-point DCT using Loeffler's algorithm:
+       C. Loeffler, A. Ligtenberg and G. S. Moschytz, "Practical fast 1-D DCT
+       algorithms with 11 multiplications," International Conference on
+       Acoustics, Speech, and Signal Processing,, Glasgow, UK, 1989, pp. 988-991
+       vol.2.  */
+
+    constexpr auto rt2 = static_cast<armral_half>(0x1.6a09e667f3bcdp0);
+    // 2 * cos(1 * pi / 8)
+    constexpr auto a = static_cast<armral_half>(0x1.d906bcf328d46p0);
+    // 2 * cos(3 * pi / 8)
+    constexpr auto b = static_cast<armral_half>(0x1.87de2a6aea963p-1);
+    // 2 * cos(9 * pi / 8)
+    constexpr auto c = static_cast<armral_half>(-0x1.d906bcf328d46p0);
+
+    ARMRAL_PRAGMA_AUTOVEC
+    for (int k = 0; k < howmany.v; k++) {
+      const auto x0 = in[howmany.v * 0 + k];
+      const auto x1 = in[howmany.v * 1 + k];
+      const auto x2 = in[howmany.v * 2 + k];
+      const auto x3 = in[howmany.v * 3 + k];
+      const auto y0 = x0 + x3;
+      const auto y1 = x1 + x2;
+      const auto y2 = x0 - x3;
+      const auto y3 = x1 - x2;
+      out[howmany.v * 0 + k] = 2 * (y0 + y1);
+      out[howmany.v * 1 + k] = a * y2 + b * y3;
+      out[howmany.v * 2 + k] = rt2 * (y0 - y1);
+      out[howmany.v * 3 + k] = b * y2 + c * y3;
+    }
+
+    return ARMRAL_SUCCESS;
+  };
+};
+
+template<>
+utils::unique_ptr<armral_dct_plan_t> create_direct_plan<4>(int howmany) {
+  return utils::make_unique<direct_plan<4, dynamic_value>>(howmany);
+}
+
+} // namespace armral::dct
diff --git a/src/LowerPHY/DCT/direct_5_point_plan.cpp b/src/LowerPHY/DCT/direct_5_point_plan.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f2e45a1990674b369bc063f386d8d38236550c24
--- /dev/null
+++ b/src/LowerPHY/DCT/direct_5_point_plan.cpp
@@ -0,0 +1,70 @@
+/*
+    Arm RAN Acceleration Library
+    SPDX-FileCopyrightText: <text>Copyright 2025 Arm Limited and/or its
+    affiliates <open-source-office@arm.com></text>
+    SPDX-License-Identifier: BSD-3-Clause
+*/
+#include "direct_plan.hpp"
+#include "dynamic_value.hpp"
+#include "pragmas.hpp"
+
+namespace armral::dct {
+
+template<class Howmany>
+struct direct_plan_worker<5, Howmany> {
+  static armral_status execute(const armral_half *in,
+                               armral_half *out, // NOLINT (LLVM bug 41393)
+                               Howmany howmany) {
+    /* 5-point algorithm from appendix B of:
+       G. Bi and L. W. Yu, "DCT algorithms for composite sequence lengths," in
+       IEEE Transactions on Signal Processing, vol. 46, no. 3, pp. 554-562,
+       March 1998.  */
+
+    // (cos(pi / 5) - cos(3 * pi / 5)) / 2
+    constexpr auto a = static_cast<armral_half>(0x1.1e3779b97f4a8p-1);
+    // cos(pi / 10)
+    constexpr auto b = static_cast<armral_half>(0x1.e6f0e134454ffp-1);
+    // cos(pi / 10) - cos(3 * pi / 10)
+    constexpr auto c = static_cast<armral_half>(0x1.73fd61d9df543p-2);
+    // cos(pi / 10) + cos(3 * pi / 10)
+    constexpr auto d = static_cast<armral_half>(0x1.89f188bdcd7afp0);
+
+    ARMRAL_PRAGMA_AUTOVEC
+    for (int k = 0; k < howmany.v; k++) {
+      const auto x0 = in[howmany.v * 0 + k];
+      const auto x1 = in[howmany.v * 1 + k];
+      const auto x2 = in[howmany.v * 2 + k];
+      const auto x3 = in[howmany.v * 3 + k];
+      const auto x4 = in[howmany.v * 4 + k];
+
+      const auto u0 = x0 + x4;
+      const auto u1 = x1 + x3;
+      const auto v0 = x0 - x4;
+      const auto v1 = x1 - x3;
+      auto t1 = u0 + u1;
+      auto t2 = u0 - u1;
+      auto t3 = (t1 * static_cast<armral_half>(0.25)) - x2;
+      const auto t4 = t2 * a;
+
+      out[howmany.v * 0 + k] = (t1 + x2) * 2;
+      out[howmany.v * 2 + k] = (t3 + t4) * 2;
+      out[howmany.v * 4 + k] = (t4 - t3) * 2;
+
+      t1 = (v0 + v1) * b;
+      t2 = v1 * c;
+      t3 = v0 * d;
+
+      out[howmany.v * 1 + k] = (t1 - t2) * 2;
+      out[howmany.v * 3 + k] = (t3 - t1) * 2;
+    }
+
+    return ARMRAL_SUCCESS;
+  };
+};
+
+template<>
+utils::unique_ptr<armral_dct_plan_t> create_direct_plan<5>(int howmany) {
+  return utils::make_unique<direct_plan<5, dynamic_value>>(howmany);
+}
+
+} // namespace armral::dct
diff --git a/src/LowerPHY/DCT/direct_7_point_plan.cpp b/src/LowerPHY/DCT/direct_7_point_plan.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d04661ed0eea3e9363a76a14d498488bd3206c16
--- /dev/null
+++ b/src/LowerPHY/DCT/direct_7_point_plan.cpp
@@ -0,0 +1,61 @@
+/*
+    Arm RAN Acceleration Library
+    SPDX-FileCopyrightText: <text>Copyright 2025 Arm Limited and/or its
+    affiliates <open-source-office@arm.com></text>
+    SPDX-License-Identifier: BSD-3-Clause
+*/
+#include "direct_plan.hpp"
+#include "dynamic_value.hpp"
+#include "pragmas.hpp"
+
+namespace armral::dct {
+
+template<class Howmany>
+struct direct_plan_worker<7, Howmany> {
+  static armral_status execute(const armral_half *in,
+                               armral_half *out, // NOLINT (LLVM bug 41393)
+                               Howmany howmany) {
+    // ci = 2 * cos(i * pi / 14)
+    constexpr auto c1 = static_cast<armral_half>(0x1.f329c0558e969p0);
+    constexpr auto c2 = static_cast<armral_half>(0x1.cd4bca9cb5c71p0);
+    constexpr auto c3 = static_cast<armral_half>(0x1.904c37505de4bp0);
+    constexpr auto c4 = static_cast<armral_half>(0x1.3f3a0e28bedd1p0);
+    constexpr auto c5 = static_cast<armral_half>(0x1.bc4c04d71abc1p-1);
+    constexpr auto c6 = static_cast<armral_half>(0x1.c7b90e3024582p-2);
+
+    ARMRAL_PRAGMA_AUTOVEC
+    for (int k = 0; k < howmany.v; k++) {
+      const auto x0 = in[0 * howmany.v + k];
+      const auto x1 = in[1 * howmany.v + k];
+      const auto x2 = in[2 * howmany.v + k];
+      const auto x3 = in[3 * howmany.v + k] * 2;
+      const auto x4 = in[4 * howmany.v + k];
+      const auto x5 = in[5 * howmany.v + k];
+      const auto x6 = in[6 * howmany.v + k];
+
+      const auto p06 = x0 + x6;
+      const auto p15 = x1 + x5;
+      const auto p24 = x2 + x4;
+      const auto s06 = x0 - x6;
+      const auto s15 = x1 - x5;
+      const auto s24 = x2 - x4;
+
+      out[0 * howmany.v + k] = 2 * (p06 + p15 + p24) + x3;
+      out[1 * howmany.v + k] = c1 * s06 + c3 * s15 + c5 * s24;
+      out[2 * howmany.v + k] = c2 * p06 - c4 * p24 + c6 * p15 - x3;
+      out[3 * howmany.v + k] = c3 * s06 - c1 * s24 - c5 * s15;
+      out[4 * howmany.v + k] = c4 * p06 - c2 * p15 - c6 * p24 + x3;
+      out[5 * howmany.v + k] = c3 * s24 + c5 * s06 - c1 * s15;
+      out[6 * howmany.v + k] = c2 * p24 - c4 * p15 + c6 * p06 - x3;
+    }
+
+    return ARMRAL_SUCCESS;
+  };
+};
+
+template<>
+utils::unique_ptr<armral_dct_plan_t> create_direct_plan<7>(int howmany) {
+  return utils::make_unique<direct_plan<7, dynamic_value>>(howmany);
+}
+
+} // namespace armral::dct
diff --git a/src/LowerPHY/DCT/direct_8_point_plan.cpp b/src/LowerPHY/DCT/direct_8_point_plan.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..964f3ecc9dfae0b37fdb2fb69f7bb2c3f18d2a1b
--- /dev/null
+++ b/src/LowerPHY/DCT/direct_8_point_plan.cpp
@@ -0,0 +1,109 @@
+/*
+    Arm RAN Acceleration Library
+    SPDX-FileCopyrightText: <text>Copyright 2025 Arm Limited and/or its
+    affiliates <open-source-office@arm.com></text>
+    SPDX-License-Identifier: BSD-3-Clause
+*/
+#include "direct_plan.hpp"
+#include "dynamic_value.hpp"
+#include "pragmas.hpp"
+#include "static_value.hpp"
+
+namespace armral::dct {
+
+template<class Howmany>
+struct direct_plan_worker<8, Howmany> {
+  static armral_status execute(const armral_half *in,
+                               armral_half *out, // NOLINT (LLVM bug 41393)
+                               Howmany howmany) {
+    /* 8-point DCT using Loeffler's algorithm:
+       C. Loeffler, A. Ligtenberg and G. S. Moschytz, "Practical fast 1-D DCT
+       algorithms with 11 multiplications," International Conference on
+       Acoustics, Speech, and Signal Processing,, Glasgow, UK, 1989, pp. 988-991
+       vol.2.  */
+
+    // 2 * sin(1 * pi / 16)
+    constexpr auto s1 = static_cast<armral_half>(0x1.8f8b83c69a60bp-2);
+    // 2 * cos(1 * pi / 16)
+    constexpr auto c1 = static_cast<armral_half>(0x1.f6297cff75cbp0);
+    // 2 * sin(3 * pi / 16)
+    constexpr auto s3 = static_cast<armral_half>(0x1.1c73b39ae68c8p0);
+    // 2 * cos(3 * pi / 16)
+    constexpr auto c3 = static_cast<armral_half>(0x1.a9b66290ea1a3p0);
+    // 2 * sin(6 * pi / 16)
+    constexpr auto s6 = static_cast<armral_half>(0x1.d906bcf328d46p0);
+    // 2 * cos(6 * pi / 16)
+    constexpr auto c6 = static_cast<armral_half>(0x1.87de2a6aea963p-1);
+    // sqrt(2)
+    constexpr auto rt2 = static_cast<armral_half>(0x1.6a09e667f3bcdp0);
+    // 1 / sqrt(2)
+    constexpr auto irt2 = static_cast<armral_half>(0x1.6a09e667f3bcdp-1);
+
+    ARMRAL_PRAGMA_AUTOVEC
+    for (int k = 0; k < howmany.v; k++) {
+      const auto x_0 = in[howmany.v * 0 + k];
+      const auto x_1 = in[howmany.v * 1 + k];
+      const auto x_2 = in[howmany.v * 2 + k];
+      const auto x_3 = in[howmany.v * 3 + k];
+      const auto x_4 = in[howmany.v * 4 + k];
+      const auto x_5 = in[howmany.v * 5 + k];
+      const auto x_6 = in[howmany.v * 6 + k];
+      const auto x_7 = in[howmany.v * 7 + k];
+
+      // Stage 1
+      const auto x1_0 = x_0 + x_7;
+      const auto x1_1 = x_1 + x_6;
+      const auto x1_2 = x_2 + x_5;
+      const auto x1_3 = x_3 + x_4;
+      const auto x1_4 = x_3 - x_4;
+      const auto x1_5 = x_2 - x_5;
+      const auto x1_6 = x_1 - x_6;
+      const auto x1_7 = x_0 - x_7;
+
+      // Stage 2
+      const auto x2_0 = x1_0 + x1_3;
+      const auto x2_1 = x1_1 + x1_2;
+      const auto x2_2 = x1_1 - x1_2;
+      const auto x2_3 = x1_0 - x1_3;
+      const auto x2_4 = c3 * x1_4 + s3 * x1_7;
+      const auto x2_5 = c1 * x1_5 + s1 * x1_6;
+      const auto x2_6 = c1 * x1_6 - s1 * x1_5;
+      const auto x2_7 = c3 * x1_7 - s3 * x1_4;
+
+      // Stage 3
+      const auto x3_0 = x2_0 + x2_1;
+      const auto x3_1 = x2_0 - x2_1;
+      const auto x3_2 = c6 * x2_2 + s6 * x2_3;
+      const auto x3_3 = c6 * x2_3 - s6 * x2_2;
+      const auto x3_4 = (x2_4 + x2_6) * irt2;
+      const auto x3_5 = x2_7 - x2_5;
+      const auto x3_6 = x2_4 - x2_6;
+      const auto x3_7 = (x2_7 + x2_5) * irt2;
+
+      out[k + howmany.v * 0] = x3_0 * 2;
+      out[k + howmany.v * 2] = x3_2;
+      out[k + howmany.v * 4] = x3_1 * rt2;
+      out[k + howmany.v * 6] = x3_3;
+
+      // Stage 4
+      out[k + howmany.v * 1] = x3_7 + x3_4;
+      out[k + howmany.v * 3] = x3_5;
+      out[k + howmany.v * 5] = x3_6;
+      out[k + howmany.v * 7] = x3_7 - x3_4;
+    }
+
+    return ARMRAL_SUCCESS;
+  };
+};
+
+template<>
+utils::unique_ptr<armral_dct_plan_t> create_direct_plan<8>(int howmany) {
+  switch (howmany) {
+  case 1:
+    return utils::make_unique<direct_plan<8, static_value<1>>>(howmany);
+  default:
+    return utils::make_unique<direct_plan<8, dynamic_value>>(howmany);
+  }
+}
+
+} // namespace armral::dct
diff --git a/src/LowerPHY/DCT/direct_9_point_plan.cpp b/src/LowerPHY/DCT/direct_9_point_plan.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c8d3f2907f70ae9f154601c200e4214f0673589a
--- /dev/null
+++ b/src/LowerPHY/DCT/direct_9_point_plan.cpp
@@ -0,0 +1,98 @@
+/*
+    Arm RAN Acceleration Library
+    SPDX-FileCopyrightText: <text>Copyright 2025 Arm Limited and/or its
+    affiliates <open-source-office@arm.com></text>
+    SPDX-License-Identifier: BSD-3-Clause
+*/
+#include "direct_plan.hpp"
+#include "dynamic_value.hpp"
+#include "pragmas.hpp"
+
+namespace armral::dct {
+
+template<class Howmany>
+struct direct_plan_worker<9, Howmany> {
+  static armral_status execute(const armral_half *in,
+                               armral_half *out, // NOLINT (LLVM bug 41393)
+                               Howmany howmany) {
+    /* 5-point algorithm from appendix B of:
+       G. Bi and L. W. Yu, "DCT algorithms for composite sequence lengths," in
+       IEEE Transactions on Signal Processing, vol. 46, no. 3, pp. 554-562,
+       March 1998, with some modification for the extra factor of 2 in the
+       output.  */
+
+    // 2 * sin(1 * pi / 9)
+    constexpr auto s1 = static_cast<armral_half>(0x1.5e3a8748a0bf5p-1);
+    // 2 * sin(2 * pi / 9)
+    constexpr auto s2 = static_cast<armral_half>(0x1.491b7523c161dp0);
+    // 2 * sin(3 * pi /9)
+    constexpr auto s3 = static_cast<armral_half>(0x1.bb67ae8584caap0);
+    // 2 * cos(1 * pi / 9)
+    constexpr auto c1 = static_cast<armral_half>(0x1.e11f642522d1cp0);
+    // 2 * cos(2 * pi / 9)
+    constexpr auto c2 = static_cast<armral_half>(0x1.8836fa2cf5039p0);
+    // 2 * cos(0.5 * pi / 9)
+    constexpr auto ch = static_cast<armral_half>(0x1.f838b8c811c17p0);
+    // 2 * sin(0.5 * pi / 9)
+    constexpr auto sh = static_cast<armral_half>(0x1.63a1a7e0b738ap-2);
+
+    ARMRAL_PRAGMA_AUTOVEC
+    for (int k = 0; k < howmany.v; k++) {
+      const auto x0 = in[howmany.v * 0 + k];
+      const auto x1 = in[howmany.v * 1 + k];
+      const auto x2 = in[howmany.v * 2 + k];
+      const auto x3 = in[howmany.v * 3 + k];
+      const auto x4 = in[howmany.v * 4 + k] * 2;
+      const auto x5 = in[howmany.v * 5 + k];
+      const auto x6 = in[howmany.v * 6 + k];
+      const auto x7 = in[howmany.v * 7 + k];
+      const auto x8 = in[howmany.v * 8 + k];
+
+      const auto u0 = x0 + x8;
+      const auto u1 = x1 + x7;
+      const auto u2 = x2 + x6;
+      const auto u3 = x3 + x5;
+      const auto v0 = x0 - x8;
+      const auto v1 = x1 - x7;
+      const auto v2 = x2 - x6;
+      const auto v3 = x3 - x5;
+
+      const auto t0 = u0 + u2 + u3;
+      const auto t2 = u0 - u2;
+      const auto t3 = u2 - u3;
+      const auto t1 = (t2 + t3) * c2;
+      const auto t4 = u1 - x4;
+
+      const auto t6 = t2 * sh;
+      const auto t7 = t3 * c1;
+      const auto t8 = 2 * u1 + x4;
+      const auto t9 = t1 + t6;
+      const auto t10 = t1 - t7;
+
+      const auto t11 = v1 * s3;
+      const auto t12 = (v0 + v3) * ch;
+      const auto t13 = (v2 - v3) * s2;
+      const auto t14 = (v0 + v2) * s1;
+      const auto t15 = t12 + t13;
+      const auto t16 = t12 - t14;
+
+      out[k + howmany.v * 0] = 2 * t0 + t8;
+      out[k + howmany.v * 1] = t15 + t11;
+      out[k + howmany.v * 2] = t9 + t4;
+      out[k + howmany.v * 3] = (v0 - v2 - v3) * s3;
+      out[k + howmany.v * 4] = t10 - t4;
+      out[k + howmany.v * 5] = t16 - t11;
+      out[k + howmany.v * 6] = t0 - t8;
+      out[k + howmany.v * 7] = t15 - t16 - t11;
+      out[k + howmany.v * 8] = t9 - t10 - t4;
+    }
+
+    return ARMRAL_SUCCESS;
+  };
+};
+
+template<>
+utils::unique_ptr<armral_dct_plan_t> create_direct_plan<9>(int howmany) {
+  return utils::make_unique<direct_plan<9, dynamic_value>>(howmany);
+}
+} // namespace armral::dct
diff --git a/src/LowerPHY/DCT/direct_plan.hpp b/src/LowerPHY/DCT/direct_plan.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..690c41eee886f023dc356b8e07fc4122ec7bdd74
--- /dev/null
+++ b/src/LowerPHY/DCT/direct_plan.hpp
@@ -0,0 +1,38 @@
+/*
+    Arm RAN Acceleration Library
+    SPDX-FileCopyrightText: <text>Copyright 2025 Arm Limited and/or its
+    affiliates <open-source-office@arm.com></text>
+    SPDX-License-Identifier: BSD-3-Clause
+*/
+#pragma once
+
+#include "dct_plan.hpp"
+
+#include <cstdio>
+
+namespace armral::dct {
+
+template<int N, class Howmany>
+struct direct_plan_worker;
+
+template<int N, class Howmany>
+class direct_plan : public armral_dct_plan_t {
+public:
+  direct_plan(int howmany) : m_howmany(howmany) {}
+
+  armral_status execute(const armral_half *in, armral_half *out) const final {
+    return direct_plan_worker<N, Howmany>::execute(in, out, m_howmany);
+  };
+
+  void print_plan() const final {
+    printf("direct_%d_point_plan(howmany=%d)\n", N, m_howmany.v);
+  }
+
+private:
+  const Howmany m_howmany;
+};
+
+template<int N>
+utils::unique_ptr<armral_dct_plan_t> create_direct_plan(int howmany);
+
+} // namespace armral::dct
diff --git a/src/LowerPHY/DCT/dynamic_value.hpp b/src/LowerPHY/DCT/dynamic_value.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..f7794f97406f9696d0b9b428ad253aa741b834ca
--- /dev/null
+++ b/src/LowerPHY/DCT/dynamic_value.hpp
@@ -0,0 +1,18 @@
+/*
+    Arm RAN Acceleration Library
+    SPDX-FileCopyrightText: <text>Copyright 2025 Arm Limited and/or its
+    affiliates <open-source-office@arm.com></text>
+    SPDX-License-Identifier: BSD-3-Clause
+*/
+#pragma once
+
+class dynamic_value {
+  /* Utility class to allow DCT stages to provide specialisations for certain
+     values of some parameter (via the partner class static_value) while still
+     allowing arbitrary values to be chosen at runtime.  */
+public:
+  // NOLINTNEXTLINE(readability-identifier-naming)
+  dynamic_value(int v_) : v(v_) {}
+
+  const int v;
+};
diff --git a/src/LowerPHY/DCT/odd_radix_plan.hpp b/src/LowerPHY/DCT/odd_radix_plan.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..406d0b1e597f4d6abfbe7b53f44a5ef0de38cc39
--- /dev/null
+++ b/src/LowerPHY/DCT/odd_radix_plan.hpp
@@ -0,0 +1,91 @@
+/*
+    Arm RAN Acceleration Library
+    SPDX-FileCopyrightText: <text>Copyright 2025 Arm Limited and/or its
+    affiliates <open-source-office@arm.com></text>
+    SPDX-License-Identifier: BSD-3-Clause
+*/
+#pragma once
+
+#include "dct_plan.hpp"
+#include "pod_vector.hpp"
+#include "unique_ptr.hpp"
+
+#include <cmath>
+#include <cstdio>
+
+namespace armral::dct {
+
+template<int Q, class Howmany>
+struct odd_radix_worker;
+
+template<int Q, class Howmany>
+class odd_radix_plan : public armral_dct_plan_t {
+public:
+  odd_radix_plan(int n, int howmany)
+    : m_howmany(howmany), m_n(n), m_d(n / Q), m_twid_t(n_cs * m_d),
+      m_twid_a(n_cs * n), m_plan(make_mixed_radix_plan(n / Q, Q * howmany)) {
+    /* Initialise twiddle factors as described in equations 9 and 10:
+       G. Bi and L. W. Yu, "DCT algorithms for composite sequence lengths," in
+       IEEE Transactions on Signal Processing, vol. 46, no. 3, pp. 554-562,
+       March 1998.
+
+       m_twid_t implements the tan-based factor. Even elements of m_twid_a
+       implement the cos-based factor in eqn. 9, odd elements implement the
+       sin-based factor in eqn. 10.  */
+    utils::pod_vector<double> tmp(m_twid_t.size());
+    for (int m = 0; m < n_cs; m++) {
+      for (int k = 0; k < m_d; k++) {
+        const double twid_idx = M_PI * k * (Q - 1 - 2 * m) / (2 * n);
+        m_twid_t[m * m_d + k] = static_cast<armral_half>(std::tan(twid_idx));
+        tmp[m * m_d + k] = std::cos(twid_idx);
+      }
+    }
+
+    for (int j = 0; j < Q; j++) {
+      for (int m = 0; m < n_cs; m++) {
+        const double twid_idx = M_PI * (Q - 1 - 2 * m) * j / (2 * Q);
+        for (int k = 0; k < m_d; k++) {
+          if (j % 2 == 0) {
+            m_twid_a[m_d * (j * n_cs + m) + k] = static_cast<armral_half>(
+                tmp[m * m_d + k] * std::cos(twid_idx) * std::pow(-1.0, j / 2));
+          } else {
+            m_twid_a[m_d * (j * n_cs + m) + k] =
+                static_cast<armral_half>(tmp[m * m_d + k] * std::sin(twid_idx) *
+                                         std::pow(-1.0, (j - 1) / 2));
+          }
+        }
+      }
+    }
+  }
+
+  void print_plan() const final {
+    printf("radix_%d_plan(n=%d, howmany=%d)\n", Q, m_n, m_howmany.v);
+    m_plan->print_plan();
+  }
+
+  armral_status execute(const armral_half *in, armral_half *out) const final {
+    utils::pod_vector<armral_half> buf(m_n * m_howmany.v);
+    odd_radix_worker<Q, Howmany>::prepare_sub_transform(in, buf, m_d,
+                                                        m_howmany);
+    m_plan->execute(buf.data(), buf.data());
+    odd_radix_worker<Q, Howmany>::extract_sub_transform(
+        out, buf, m_d, m_howmany, m_twid_t, m_twid_a);
+    return ARMRAL_SUCCESS;
+  }
+
+private:
+  // Highest index of S_n or C_n sub-transform
+  static constexpr int n_cs = (Q - 1) / 2;
+  const Howmany m_howmany;
+  const int m_n;
+  // N of the sub-transform
+  const int m_d;
+  utils::pod_vector<armral_half> m_twid_t;
+  utils::pod_vector<armral_half> m_twid_a;
+  utils::unique_ptr<armral_dct_plan_t> m_plan;
+};
+
+template<int Q>
+utils::unique_ptr<armral_dct_plan_t> create_odd_radix_plan(int n, int howmany);
+
+} // namespace armral::dct
diff --git a/src/LowerPHY/DCT/radix_2_plan.cpp b/src/LowerPHY/DCT/radix_2_plan.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..46a05b1f9df19e1bf0988b559877622f6742153e
--- /dev/null
+++ b/src/LowerPHY/DCT/radix_2_plan.cpp
@@ -0,0 +1,77 @@
+/*
+    Arm RAN Acceleration Library
+    SPDX-FileCopyrightText: <text>Copyright 2025 Arm Limited and/or its
+    affiliates <open-source-office@arm.com></text>
+    SPDX-License-Identifier: BSD-3-Clause
+*/
+#include "radix_2_plan.hpp"
+#include "dct_plan.hpp"
+#include "dynamic_value.hpp"
+#include "pragmas.hpp"
+#include "static_value.hpp"
+
+#include <cstdio>
+
+namespace armral::dct {
+
+template<class Howmany>
+radix_2_plan<Howmany>::radix_2_plan(int n, int howmany)
+  : m_n(n), m_howmany(howmany), m_twid(n / 2), m_buf(n * howmany),
+    // Use Tw as storage and working precision for the sub-plan
+    m_plan(make_mixed_radix_plan(n / 2, howmany * 2)) {
+  for (int i = 0; i < n / 2; i++) {
+    double m = i * 2 + 1;
+    m_twid[i] = static_cast<armral_half>(0.5 / std::cos(M_PI * m / (2 * n)));
+  }
+}
+
+template<class Howmany>
+armral_status radix_2_plan<Howmany>::execute(const armral_half *in,
+                                             armral_half *out) const {
+  /* Radix-2 decomposition as described in:
+     M. Perera, S., Silverio, D., Ogle, A. (2019). Efficient Split-Radix and
+     Radix-4 DCT Algorithms and Applications. Analysis of Experimental
+     Algorithms. SEA 2019. Lecture Notes in Computer Science(), vol 11544.
+     Springer, Cham.  */
+
+  for (int i = 0; i < m_n / 2; i++) {
+    ARMRAL_PRAGMA_AUTOVEC
+    for (int k = 0; k < m_howmany.v; k++) {
+      const auto a = in[k + m_howmany.v * i];
+      const auto b = in[k + m_howmany.v * (m_n - i - 1)];
+      m_buf[k + m_howmany.v * (2 * i)] = a + b;
+      m_buf[k + m_howmany.v * (2 * i + 1)] = (a - b) * m_twid[i];
+    }
+  }
+
+  m_plan->execute(m_buf.data(), out);
+
+  for (int i = 1; i < m_n - 1; i += 2) {
+    ARMRAL_PRAGMA_AUTOVEC
+    for (int k = 0; k < m_howmany.v; k++) {
+      out[m_howmany.v * i + k] += out[m_howmany.v * (i + 2) + k];
+    }
+  }
+  return ARMRAL_SUCCESS;
+}
+
+template<class Howmany>
+void radix_2_plan<Howmany>::print_plan() const {
+  printf("radix_2_plan(n=%d, howmany=%d)\n", m_n, m_howmany.v);
+  m_plan->print_plan();
+}
+
+utils::unique_ptr<armral_dct_plan_t> create_radix_2_plan(int n, int howmany) {
+  switch (howmany) {
+  case 1:
+    return utils::make_unique<radix_2_plan<static_value<1>>>(n, howmany);
+  case 2:
+    return utils::make_unique<radix_2_plan<static_value<2>>>(n, howmany);
+  case 3:
+    return utils::make_unique<radix_2_plan<static_value<3>>>(n, howmany);
+  default:
+    return utils::make_unique<radix_2_plan<dynamic_value>>(n, howmany);
+  }
+}
+
+} // namespace armral::dct
diff --git a/src/LowerPHY/DCT/radix_2_plan.hpp b/src/LowerPHY/DCT/radix_2_plan.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..3b858050da674b42b8bd8a73858fb79251709a24
--- /dev/null
+++ b/src/LowerPHY/DCT/radix_2_plan.hpp
@@ -0,0 +1,33 @@
+/*
+    Arm RAN Acceleration Library
+    SPDX-FileCopyrightText: <text>Copyright 2025 Arm Limited and/or its
+    affiliates <open-source-office@arm.com></text>
+    SPDX-License-Identifier: BSD-3-Clause
+*/
+#pragma once
+
+#include "dct_plan.hpp"
+#include "pod_vector.hpp"
+#include "unique_ptr.hpp"
+
+namespace armral::dct {
+
+template<class Howmany>
+class radix_2_plan : public armral_dct_plan_t {
+public:
+  radix_2_plan(int n, int howmany);
+
+  armral_status execute(const armral_half *in, armral_half *out) const final;
+  void print_plan() const override;
+
+private:
+  const int m_n;
+  const Howmany m_howmany;
+  utils::pod_vector<armral_half> m_twid;
+  utils::pod_vector<armral_half> m_buf;
+  utils::unique_ptr<armral_dct_plan_t> m_plan;
+};
+
+utils::unique_ptr<armral_dct_plan_t> create_radix_2_plan(int n, int howmany);
+
+} // namespace armral::dct
diff --git a/src/LowerPHY/DCT/radix_3_plan.cpp b/src/LowerPHY/DCT/radix_3_plan.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..fa14bc4c85a74ab7c9297922e1a405aedb73bb79
--- /dev/null
+++ b/src/LowerPHY/DCT/radix_3_plan.cpp
@@ -0,0 +1,113 @@
+/*
+    Arm RAN Acceleration Library
+    SPDX-FileCopyrightText: <text>Copyright 2025 Arm Limited and/or its
+    affiliates <open-source-office@arm.com></text>
+    SPDX-License-Identifier: BSD-3-Clause
+*/
+#include "dynamic_value.hpp"
+#include "odd_radix_plan.hpp"
+#include "pragmas.hpp"
+#include "static_value.hpp"
+
+namespace armral::dct {
+
+template<typename Howmany>
+struct odd_radix_worker<3, Howmany> {
+  static void prepare_sub_transform(const armral_half *in,
+                                    const utils::pod_vector<armral_half> &buf,
+                                    int d, Howmany howmany) {
+    // Prepare the inputs for transforms A, C_0 and S_0 (the latter two are just
+    // referred to as c and s throughout
+    for (int i = 0; i < d; i++) {
+      ARMRAL_PRAGMA_AUTOVEC
+      for (int k = 0; k < howmany.v; k++) {
+        // x and y are inputs to the butterfly forming input sequences for c and
+        // s
+        const auto x = in[(3 * i + 0) * howmany.v + k];
+        const auto y = in[(3 * i + 2) * howmany.v + k];
+        const auto c_in = x + y;
+        auto s_in = x - y;
+        // (-1)^n term of eqn. 6b
+        if (i % 2 == 1) {
+          s_in = -s_in;
+        }
+
+        const auto a_in = in[(3 * i + 1) * howmany.v + k];
+
+        buf[(3 * i + 0) * howmany.v + k] = a_in;
+        buf[(3 * i + 1) * howmany.v + k] = c_in;
+        buf[(3 * i + 2) * howmany.v + k] = s_in;
+      }
+    }
+  };
+
+  static void
+  extract_sub_transform(armral_half *out, // NOLINT (LLVM bug 41393)
+                        const utils::pod_vector<armral_half> &buf, int d,
+                        Howmany howmany,
+                        const utils::pod_vector<armral_half> &twid_t,
+                        const utils::pod_vector<armral_half> &twid_a) {
+    // First element of each subsequence F_n (n = 0 to 2) requires minimal
+    // post-processing. In addition some twiddle factors can be elided as they
+    // are 1 or 0.  */
+    ARMRAL_PRAGMA_AUTOVEC
+    for (int k = 0; k < howmany.v; k++) {
+      const auto a = buf[0 * howmany.v + k];
+      const auto c = buf[1 * howmany.v + k];
+      const auto s = buf[2 * howmany.v + k];
+      out[k + 0 * d * howmany.v] = c + a;
+      out[k + 1 * d * howmany.v] = s * twid_a[d];
+      out[k + 2 * d * howmany.v] = c * static_cast<armral_half>(0.5) - a;
+    }
+
+    // Unrolled version of the loop over n_cs in equations 9 and 10
+    for (int i = 1; i < d; i++) {
+      ARMRAL_PRAGMA_AUTOVEC
+      for (int k = 0; k < howmany.v; k++) {
+        // c and s are combined in opposite directions - loop over both
+        // in both directions to fully unroll the sum
+        const auto c_fwd = buf[(i * 3 + 1) * howmany.v + k];
+        const auto s_fwd = buf[(i * 3 + 2) * howmany.v + k];
+        const auto c_bck = buf[((d - i) * 3 + 1) * howmany.v + k];
+        const auto s_bck = buf[((d - i) * 3 + 2) * howmany.v + k];
+        const auto a = buf[i * howmany.v * 3 + k];
+
+        out[(0 * d + i) * howmany.v + k] =
+            (c_fwd + s_bck * twid_t[i]) * twid_a[0 * d + i] + a;
+        out[(1 * d + i) * howmany.v + k] =
+            (s_fwd + c_bck * twid_t[i]) * twid_a[1 * d + i];
+        out[(2 * d + i) * howmany.v + k] =
+            (c_fwd + s_bck * twid_t[i]) * twid_a[2 * d + i] - a;
+      }
+    }
+
+    for (int j = 1; j < 3; j++) {
+      // eqn. 11
+      for (int i = 1; i < d; i++) {
+        ARMRAL_PRAGMA_AUTOVEC
+        for (int k = 0; k < howmany.v; k++) {
+          out[(d * j + i) * howmany.v + k] =
+              2 * out[(d * j + i) * howmany.v + k] -
+              out[(d * j - i) * howmany.v + k];
+        }
+      }
+    }
+  };
+};
+
+template<>
+utils::unique_ptr<armral_dct_plan_t> create_odd_radix_plan<3>(int n,
+                                                              int howmany) {
+  switch (howmany) {
+  case 1:
+    return utils::make_unique<odd_radix_plan<3, static_value<1>>>(n, howmany);
+  case 2:
+    return utils::make_unique<odd_radix_plan<3, static_value<2>>>(n, howmany);
+  case 3:
+    return utils::make_unique<odd_radix_plan<3, static_value<3>>>(n, howmany);
+  default:
+    return utils::make_unique<odd_radix_plan<3, dynamic_value>>(n, howmany);
+  }
+}
+
+} // namespace armral::dct
diff --git a/src/LowerPHY/DCT/radix_5_plan.cpp b/src/LowerPHY/DCT/radix_5_plan.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e3de86e267a31e8b88afa856ee435ed2a9aee36f
--- /dev/null
+++ b/src/LowerPHY/DCT/radix_5_plan.cpp
@@ -0,0 +1,138 @@
+/*
+    Arm RAN Acceleration Library
+    SPDX-FileCopyrightText: <text>Copyright 2025 Arm Limited and/or its
+    affiliates <open-source-office@arm.com></text>
+    SPDX-License-Identifier: BSD-3-Clause
+*/
+#include "dynamic_value.hpp"
+#include "odd_radix_plan.hpp"
+#include "pragmas.hpp"
+#include "static_value.hpp"
+
+namespace armral::dct {
+
+template<typename Howmany>
+struct odd_radix_worker<5, Howmany> {
+  static void prepare_sub_transform(const armral_half *in,
+                                    const utils::pod_vector<armral_half> &buf,
+                                    int d, Howmany howmany) {
+    // Prepare the inputs for transforms A, C_0, C_1, S_0 and S_1
+    for (int i = 0; i < d; i++) {
+      ARMRAL_PRAGMA_AUTOVEC
+      for (int k = 0; k < howmany.v; k++) {
+        const auto a_in = in[(5 * i + 2) * howmany.v + k];
+        const auto x0 = in[(5 * i + 0) * howmany.v + k];
+        const auto y0 = in[(5 * i + 4) * howmany.v + k];
+        const auto x1 = in[(5 * i + 1) * howmany.v + k];
+        const auto y1 = in[(5 * i + 3) * howmany.v + k];
+        const auto c0_in = x0 + y0;
+        auto s0_in = x0 - y0;
+        const auto c1_in = x1 + y1;
+        auto s1_in = x1 - y1;
+
+        if (i % 2 == 1) {
+          // (-1)^n term of eqn. 6b
+          s0_in = -s0_in;
+          s1_in = -s1_in;
+        }
+
+        buf[(5 * i + 0) * howmany.v + k] = a_in;
+        buf[(5 * i + 1) * howmany.v + k] = c0_in;
+        buf[(5 * i + 2) * howmany.v + k] = c1_in;
+        buf[(5 * i + 3) * howmany.v + k] = s0_in;
+        buf[(5 * i + 4) * howmany.v + k] = s1_in;
+      }
+    }
+  };
+
+  static void
+  extract_sub_transform(armral_half *out, // NOLINT (LLVM bug 41393)
+                        const utils::pod_vector<armral_half> &buf, int d,
+                        Howmany howmany,
+                        const utils::pod_vector<armral_half> &twid_t,
+                        const utils::pod_vector<armral_half> &twid_a) {
+    ARMRAL_PRAGMA_AUTOVEC
+    for (int k = 0; k < howmany.v; k++) {
+      const auto a = buf[0 * howmany.v + k];
+      const auto c0 = buf[1 * howmany.v + k];
+      const auto c1 = buf[2 * howmany.v + k];
+      const auto s0 = buf[3 * howmany.v + k];
+      const auto s1 = buf[4 * howmany.v + k];
+
+      // Twiddle factors are both 1:
+      out[0 * d * howmany.v + k] = c0 + c1 + a;
+      // Because of symmetry in twid_a some twids can be shared in the below
+      out[1 * d * howmany.v + k] = s0 * twid_a[d * 2] + s1 * twid_a[d * 3];
+      out[2 * d * howmany.v + k] = c0 * twid_a[d * 4] + c1 * twid_a[d * 5] - a;
+      out[3 * d * howmany.v + k] = s0 * twid_a[d * 3] - s1 * twid_a[d * 2];
+      out[4 * d * howmany.v + k] = a - c0 * twid_a[d * 5] - c1 * twid_a[d * 4];
+    }
+
+    // Unrolled version of the loop over n_cs in equations 9 and 10
+    for (int i = 1; i < d; i++) {
+      ARMRAL_PRAGMA_AUTOVEC
+      for (int k = 0; k < howmany.v; k++) {
+        const auto a = buf[(i * 5 + 0) * howmany.v + k];
+
+        const auto c0_fwd = buf[(i * 5 + 1) * howmany.v + k];
+        const auto c1_fwd = buf[(i * 5 + 2) * howmany.v + k];
+        const auto s0_fwd = buf[(i * 5 + 3) * howmany.v + k];
+        const auto s1_fwd = buf[(i * 5 + 4) * howmany.v + k];
+
+        const auto c0_bck = buf[((d - i) * 5 + 1) * howmany.v + k];
+        const auto c1_bck = buf[((d - i) * 5 + 2) * howmany.v + k];
+        const auto s0_bck = buf[((d - i) * 5 + 3) * howmany.v + k];
+        const auto s1_bck = buf[((d - i) * 5 + 4) * howmany.v + k];
+
+        out[(0 * d + i) * howmany.v + k] =
+            (c0_fwd + s0_bck * twid_t[0 + i]) * twid_a[i] +
+            (c1_fwd + s1_bck * twid_t[d + i]) * twid_a[d + i] + a;
+
+        out[(1 * d + i) * howmany.v + k] =
+            (s0_fwd + c0_bck * twid_t[0 + i]) * twid_a[d * 2 + i] +
+            (s1_fwd + c1_bck * twid_t[d + i]) * twid_a[d * 3 + i];
+
+        out[(2 * d + i) * howmany.v + k] =
+            (c0_fwd + s0_bck * twid_t[0 + i]) * twid_a[d * 4 + i] +
+            (c1_fwd + s1_bck * twid_t[d + i]) * twid_a[d * 5 + i] - a;
+
+        out[(3 * d + i) * howmany.v + k] =
+            (s0_fwd + c0_bck * twid_t[0 + i]) * twid_a[d * 6 + i] +
+            (s1_fwd + c1_bck * twid_t[d + i]) * twid_a[d * 7 + i];
+
+        out[(4 * d + i) * howmany.v + k] =
+            (c0_fwd + s0_bck * twid_t[0 + i]) * twid_a[d * 8 + i] +
+            (c1_fwd + s1_bck * twid_t[d + i]) * twid_a[d * 9 + i] + a;
+      }
+    }
+
+    for (int j = 1; j < 5; j++) {
+      // eqn. 11
+      for (int i = 1; i < d; i++) {
+        ARMRAL_PRAGMA_AUTOVEC
+        for (int k = 0; k < howmany.v; k++) {
+          out[(d * j + i) * howmany.v + k] =
+              2 * out[(d * j + i) * howmany.v + k] -
+              out[(d * j - i) * howmany.v + k];
+        }
+      }
+    }
+  };
+};
+
+template<>
+utils::unique_ptr<armral_dct_plan_t> create_odd_radix_plan<5>(int n,
+                                                              int howmany) {
+  switch (howmany) {
+  case 1:
+    return utils::make_unique<odd_radix_plan<5, static_value<1>>>(n, howmany);
+  case 2:
+    return utils::make_unique<odd_radix_plan<5, static_value<2>>>(n, howmany);
+  case 3:
+    return utils::make_unique<odd_radix_plan<5, static_value<3>>>(n, howmany);
+  default:
+    return utils::make_unique<odd_radix_plan<5, dynamic_value>>(n, howmany);
+  }
+}
+
+} // namespace armral::dct
diff --git a/src/LowerPHY/DCT/radix_7_plan.cpp b/src/LowerPHY/DCT/radix_7_plan.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..bbc21f2ce49b9f015693ebef4e6633a7aeaecbde
--- /dev/null
+++ b/src/LowerPHY/DCT/radix_7_plan.cpp
@@ -0,0 +1,178 @@
+/*
+    Arm RAN Acceleration Library
+    SPDX-FileCopyrightText: <text>Copyright 2025 Arm Limited and/or its
+    affiliates <open-source-office@arm.com></text>
+    SPDX-License-Identifier: BSD-3-Clause
+*/
+#include "dynamic_value.hpp"
+#include "odd_radix_plan.hpp"
+#include "pragmas.hpp"
+#include "static_value.hpp"
+
+namespace armral::dct {
+
+template<typename Howmany>
+struct odd_radix_worker<7, Howmany> {
+  static void prepare_sub_transform(const armral_half *in,
+                                    const utils::pod_vector<armral_half> &buf,
+                                    int d, Howmany howmany) {
+    // Prepare the inputs for transforms A, C_n and S_n (n = 0 to 2)
+    for (int i = 0; i < d; i++) {
+      ARMRAL_PRAGMA_AUTOVEC
+      for (int k = 0; k < howmany.v; k++) {
+        const auto a_in = in[(7 * i + 3) * howmany.v + k];
+        const auto a0 = in[(7 * i + 0) * howmany.v + k];
+        const auto a1 = in[(7 * i + 1) * howmany.v + k];
+        const auto a2 = in[(7 * i + 2) * howmany.v + k];
+        const auto b0 = in[(7 * i + 6) * howmany.v + k];
+        const auto b1 = in[(7 * i + 5) * howmany.v + k];
+        const auto b2 = in[(7 * i + 4) * howmany.v + k];
+        const auto c0_in = a0 + b0;
+        auto s0_in = a0 - b0;
+        const auto c1_in = a1 + b1;
+        auto s1_in = a1 - b1;
+        const auto c2_in = a2 + b2;
+        auto s2_in = a2 - b2;
+
+        if (i % 2 == 1) {
+          // (-1)^n term of eqn. 6b
+          s0_in = -s0_in;
+          s1_in = -s1_in;
+          s2_in = -s2_in;
+        }
+
+        buf[(7 * i + 0) * howmany.v + k] = a_in;
+        buf[(7 * i + 1) * howmany.v + k] = c0_in;
+        buf[(7 * i + 2) * howmany.v + k] = c1_in;
+        buf[(7 * i + 3) * howmany.v + k] = c2_in;
+        buf[(7 * i + 4) * howmany.v + k] = s0_in;
+        buf[(7 * i + 5) * howmany.v + k] = s1_in;
+        buf[(7 * i + 6) * howmany.v + k] = s2_in;
+      }
+    }
+  };
+
+  static void
+  extract_sub_transform(armral_half *out, // NOLINT (LLVM bug 41393)
+                        const utils::pod_vector<armral_half> &buf, int d,
+                        Howmany howmany,
+                        const utils::pod_vector<armral_half> &twid_t,
+                        const utils::pod_vector<armral_half> &twid_a) {
+    ARMRAL_PRAGMA_AUTOVEC
+    for (int k = 0; k < howmany.v; k++) {
+      const auto a = buf[0 * howmany.v + k];
+      const auto c0 = buf[1 * howmany.v + k];
+      const auto c1 = buf[2 * howmany.v + k];
+      const auto c2 = buf[3 * howmany.v + k];
+      const auto s0 = buf[4 * howmany.v + k];
+      const auto s1 = buf[5 * howmany.v + k];
+      const auto s2 = buf[6 * howmany.v + k];
+
+      // Twiddle factors are both 1:
+      out[0 * d * howmany.v + k] = c0 + c1 + c2 + a;
+      // Because of symmetry in twid_a some twids can be shared in the below
+      out[1 * d * howmany.v + k] =
+          s0 * twid_a[3 * d] + s1 * twid_a[4 * d] + s2 * twid_a[5 * d];
+      out[2 * d * howmany.v + k] =
+          c0 * twid_a[6 * d] + c1 * twid_a[7 * d] + c2 * twid_a[8 * d] - a;
+      out[3 * d * howmany.v + k] =
+          s0 * twid_a[4 * d] - s1 * twid_a[5 * d] - s2 * twid_a[3 * d];
+      out[4 * d * howmany.v + k] =
+          a - c0 * twid_a[8 * d] - c1 * twid_a[6 * d] - c2 * twid_a[7 * d];
+      out[5 * d * howmany.v + k] =
+          s0 * twid_a[5 * d] - s1 * twid_a[3 * d] + s2 * twid_a[4 * d];
+      out[6 * d * howmany.v + k] =
+          c0 * twid_a[7 * d] + c1 * twid_a[8 * d] + c2 * twid_a[6 * d] - a;
+    }
+
+    // Unrolled version of the loop over n_cs in equations 9 and 10
+    for (int i = 1; i < d; i++) {
+      ARMRAL_PRAGMA_AUTOVEC
+      for (int k = 0; k < howmany.v; k++) {
+        const auto a = buf[(i * 7 + 0) * howmany.v + k];
+
+        const auto c0_fwd = buf[(i * 7 + 1) * howmany.v + k];
+        const auto c1_fwd = buf[(i * 7 + 2) * howmany.v + k];
+        const auto c2_fwd = buf[(i * 7 + 3) * howmany.v + k];
+        const auto s0_fwd = buf[(i * 7 + 4) * howmany.v + k];
+        const auto s1_fwd = buf[(i * 7 + 5) * howmany.v + k];
+        const auto s2_fwd = buf[(i * 7 + 6) * howmany.v + k];
+
+        const auto c0_bck = buf[((d - i) * 7 + 1) * howmany.v + k];
+        const auto c1_bck = buf[((d - i) * 7 + 2) * howmany.v + k];
+        const auto c2_bck = buf[((d - i) * 7 + 3) * howmany.v + k];
+        const auto s0_bck = buf[((d - i) * 7 + 4) * howmany.v + k];
+        const auto s1_bck = buf[((d - i) * 7 + 5) * howmany.v + k];
+        const auto s2_bck = buf[((d - i) * 7 + 6) * howmany.v + k];
+
+        const auto t0 = twid_t[0 * d + i];
+        const auto t1 = twid_t[1 * d + i];
+        const auto t2 = twid_t[2 * d + i];
+
+        out[(0 * d + i) * howmany.v + k] =
+            (c0_fwd + s0_bck * t0) * twid_a[0 * d + i] +
+            (c1_fwd + s1_bck * t1) * twid_a[1 * d + i] +
+            (c2_fwd + s2_bck * t2) * twid_a[2 * d + i] + a;
+
+        out[(1 * d + i) * howmany.v + k] =
+            (s0_fwd + c0_bck * t0) * twid_a[3 * d + i] +
+            (s1_fwd + c1_bck * t1) * twid_a[4 * d + i] +
+            (s2_fwd + c2_bck * t2) * twid_a[5 * d + i];
+
+        out[(2 * d + i) * howmany.v + k] =
+            (c0_fwd + s0_bck * t0) * twid_a[6 * d + i] +
+            (c1_fwd + s1_bck * t1) * twid_a[7 * d + i] +
+            (c2_fwd + s2_bck * t2) * twid_a[8 * d + i] - a;
+
+        out[(3 * d + i) * howmany.v + k] =
+            (s0_fwd + c0_bck * t0) * twid_a[9 * d + i] +
+            (s1_fwd + c1_bck * t1) * twid_a[10 * d + i] +
+            (s2_fwd + c2_bck * t2) * twid_a[11 * d + i];
+
+        out[(4 * d + i) * howmany.v + k] =
+            (c0_fwd + s0_bck * t0) * twid_a[12 * d + i] +
+            (c1_fwd + s1_bck * t1) * twid_a[13 * d + i] +
+            (c2_fwd + s2_bck * t2) * twid_a[14 * d + i] + a;
+
+        out[(5 * d + i) * howmany.v + k] =
+            (s0_fwd + c0_bck * t0) * twid_a[15 * d + i] +
+            (s1_fwd + c1_bck * t1) * twid_a[16 * d + i] +
+            (s2_fwd + c2_bck * t2) * twid_a[17 * d + i];
+
+        out[(6 * d + i) * howmany.v + k] =
+            (c0_fwd + s0_bck * t0) * twid_a[18 * d + i] +
+            (c1_fwd + s1_bck * t1) * twid_a[19 * d + i] +
+            (c2_fwd + s2_bck * t2) * twid_a[20 * d + i] - a;
+      }
+    }
+
+    for (int j = 1; j < 7; j++) {
+      // eqn. 11
+      for (int i = 1; i < d; i++) {
+        ARMRAL_PRAGMA_AUTOVEC
+        for (int k = 0; k < howmany.v; k++) {
+          out[(d * j + i) * howmany.v + k] =
+              2 * out[(d * j + i) * howmany.v + k] -
+              out[(d * j - i) * howmany.v + k];
+        }
+      }
+    }
+  };
+};
+
+template<>
+utils::unique_ptr<armral_dct_plan_t> create_odd_radix_plan<7>(int n,
+                                                              int howmany) {
+  switch (howmany) {
+  case 1:
+    return utils::make_unique<odd_radix_plan<7, static_value<1>>>(n, howmany);
+  case 2:
+    return utils::make_unique<odd_radix_plan<7, static_value<2>>>(n, howmany);
+  case 3:
+    return utils::make_unique<odd_radix_plan<7, static_value<3>>>(n, howmany);
+  default:
+    return utils::make_unique<odd_radix_plan<7, dynamic_value>>(n, howmany);
+  }
+}
+
+} // namespace armral::dct
diff --git a/src/LowerPHY/DCT/static_value.hpp b/src/LowerPHY/DCT/static_value.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..856e44036823e1574747c77045fc70baa8ac4d6f
--- /dev/null
+++ b/src/LowerPHY/DCT/static_value.hpp
@@ -0,0 +1,18 @@
+/*
+    Arm RAN Acceleration Library
+    SPDX-FileCopyrightText: <text>Copyright 2025 Arm Limited and/or its
+    affiliates <open-source-office@arm.com></text>
+    SPDX-License-Identifier: BSD-3-Clause
+*/
+#pragma once
+
+template<int V>
+class static_value {
+  /* Utility class to allow DCT stages to provide specialisations for certain
+     values of some parameter while still allowing arbitrary values of that
+     parameter to be chosen at runtime (via the partner class dynamic_value). */
+public:
+  static_value(int /* unused */) {}
+
+  static constexpr int v = V;
+};
diff --git a/test/LowerPHY/DCT/main.cpp b/test/LowerPHY/DCT/main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4ce45a407e8e96a834627aaf2bb1256b20a1c5cd
--- /dev/null
+++ b/test/LowerPHY/DCT/main.cpp
@@ -0,0 +1,130 @@
+/*
+    Arm RAN Acceleration Library
+    SPDX-FileCopyrightText: <text>Copyright 2025 Arm Limited and/or its
+    affiliates <open-source-office@arm.com></text>
+    SPDX-License-Identifier: BSD-3-Clause
+*/
+#include "armral.h"
+#include "f16_utils.hpp"
+
+#include <cmath>
+#include <iomanip>
+#include <iostream>
+#include <vector>
+
+namespace {
+
+void do_reference_transform(int n, int howmany, const armral_half *in,
+                            int istride, int idist, armral_half *out,
+                            int ostride, int odist) {
+  // Calculate C, the matrix for N-point DCT-III
+  std::vector<double> mat(n * n);
+  for (int i = 0; i < n; i++) {
+    mat[i] = 0.5;
+    for (int j = 1; j < n; j++) {
+      mat[j * n + i] = std::cos(M_PI * (0.5 + i) * j / n);
+    }
+  }
+
+  // Apply it by naive gemv, with scale by 1/N
+  for (int k = 0; k < howmany; k++) {
+    for (int j = 0; j < n; j++) {
+      double y = 0;
+      for (int i = 0; i < n; i++) {
+        y += in[i * istride + k * idist] * mat[i * n + j];
+      }
+      out[j * ostride + k * odist] = static_cast<armral_half>(y / n);
+    }
+  }
+}
+
+bool dct_test(int n, int howmany, int istride, int idist, int ostride,
+              int odist) {
+  // Machine epsilon scaled by transform complexity
+  const double base_tol = armral::utils::eps<armral_half>() * n * std::log2(n);
+  bool passed = true;
+
+  const auto verify_values_match =
+      [&passed, howmany, istride, idist, ostride, odist,
+       n](const std::vector<armral_half> &res,
+          const std::vector<armral_half> &ref, int n_samples, double tol,
+          int ref_stride, int ref_dist) {
+        for (int k = 0; k < howmany; k++) {
+          for (int i = 0; i < n_samples; i++) {
+            const double refx = ref[i * ref_stride + k * ref_dist];
+            const double resx = res[i * ostride + k * odist];
+            const double err = std::abs((refx - resx) / refx);
+
+            if (err > tol) {
+              if (passed) {
+                // First failure: print table header
+                passed = false;
+                std::cerr << "Error! [DCT] n=" << n << " howmany=" << howmany
+                          << " istride=" << istride << " idist=" << idist
+                          << " ostride=" << ostride << " odist=" << odist
+                          << " tol=" << tol << '\n';
+                std::cerr << "k\ti\tres\tref\terr\n";
+              }
+              std::cerr << k << '\t' << i << '\t' << resx << '\t' << refx
+                        << '\t' << err << '\n';
+            }
+          }
+        }
+      };
+
+  // Initialise input values
+  armral::utils::f16_random random;
+  const auto isize = std::max(howmany, istride) * std::max(n, idist);
+  const auto x = random.vector(isize, 1, 2);
+
+  // Do the DCT into y
+  const auto osize = std::max(howmany, ostride) * std::max(n, odist);
+  std::vector<armral_half> y(osize);
+  armral_dct_plan_t *plan;
+  armral_dct_create_batch_plan_f16(&plan, n, howmany, istride, idist, ostride,
+                                   odist);
+  armral_dct_execute_f16(plan, x.data(), y.data());
+  armral_dct_destroy_plan_f16(&plan);
+
+  // First element of DCT should be the sum of the input sequence scaled by 2
+  std::vector<armral_half> sums(howmany);
+  for (int k = 0; k < howmany; k++) {
+    double sum = 0;
+    for (int i = 0; i < n; i++) {
+      sum += x[i * istride + k * idist];
+    }
+    sums[k] = static_cast<armral_half>(sum) * 2;
+  }
+  // ref_stride is unused in the following call because n_samples is 1
+  verify_values_match(y, sums, 1, base_tol, 1, 1);
+
+  // Then do inverse DCT and compare with the input sequence
+  std::vector<armral_half> should_match_x(n * howmany);
+  do_reference_transform(n, howmany, y.data(), ostride, odist,
+                         should_match_x.data(), ostride, odist);
+  // Double tolerance as we have done 2 transforms
+  verify_values_match(x, should_match_x, n, 2 * base_tol, ostride, odist);
+
+  return passed;
+}
+
+bool do_interleaved_test(int n, int howmany) {
+  return dct_test(n, howmany, howmany, 1, howmany, 1);
+}
+
+} // namespace
+
+int main() {
+  bool passed = true;
+
+  const std::vector<int> ns{2,  3,  4,  5,  6,   7,   8,      9,
+                            12, 25, 49, 54, 896, 300, 81 * 8, 49 * 5 * 3};
+  const std::vector<int> howmanys{1, 2, 4, 7, 14};
+
+  for (const int n : ns) {
+    for (const int howmany : howmanys) {
+      passed &= do_interleaved_test(n, howmany);
+    }
+  }
+  exit(passed ? EXIT_SUCCESS : EXIT_FAILURE);
+}
diff --git a/utils/f16_utils.hpp b/utils/f16_utils.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..d1949f40c97405839ea9bf8b0a78abebf2843693
--- /dev/null
+++ b/utils/f16_utils.hpp
@@ -0,0 +1,59 @@
+/*
+    Arm RAN Acceleration Library
+    SPDX-FileCopyrightText: <text>Copyright 2020-2025 Arm Limited and/or its
+    affiliates <open-source-office@arm.com></text>
+    SPDX-License-Identifier: BSD-3-Clause
+*/
+#pragma once
+
+#include "armral.h"
+#include "rng.hpp"
+
+#include <vector>
+
+namespace armral::utils {
+
+class f16_random : public base_random<armral_half, armral_half> {
+public:
+  f16_random(std::initializer_list<uint64_t> seeds = {42})
+    : base_random<armral_half, armral_half>(seeds) {}
+
+  static constexpr armral_half default_min = -1;
+  static constexpr armral_half default_max = 1;
+
+  armral_half one(armral_half min = default_min,
+                  armral_half max = default_max) {
+    return rand(min, max);
+  }
+
+  std::vector<armral_half> vector(size_t len, armral_half min = default_min,
+                                  armral_half max = default_max) {
+    return base_random<armral_half, armral_half>::vector_impl(len, min, max);
+  }
+};
+
+template<typename T>
+struct float_uint;
+
+template<>
+struct float_uint<armral_half> {
+  using type = uint16_t;
+};
+
+template<typename T>
+using float_uint_t = typename float_uint<T>::type;
+
+// Get machine epsilon b^-(p-1)
+template<typename T>
+inline T eps() {
+  union {
+    float_uint_t<T> ival;
+    T fval;
+  } val;
+
+  val.fval = (T)1;
+  ++val.ival;
+  return val.fval - (T)1;
+}
+
+} // namespace armral::utils
diff --git a/utils/pod_vector.hpp b/utils/pod_vector.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..a1c48a307160d3b4abbdf84ad186f4b19f7751c2
--- /dev/null
+++ b/utils/pod_vector.hpp
@@ -0,0 +1,511 @@
+/*
+    Arm RAN Acceleration Library
+    SPDX-FileCopyrightText: <text>Copyright 2025 Arm Limited and/or its
+    affiliates <open-source-office@arm.com></text>
+    SPDX-License-Identifier: BSD-3-Clause
+*/
+#pragma once
+
+#include "reallocator.hpp"
+
+#include <algorithm>
+#include <cassert>
+#include <complex>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <iterator>
+#include <type_traits>
+#include <vector>
+
+namespace armral::utils {
+
+template<typename T>
+constexpr bool is_input_iterator =
+    std::is_base_of_v<std::input_iterator_tag,
+                      typename std::iterator_traits<T>::iterator_category>;
+
+/** A vector-like container for value types that will survive not having a
+ * constructor called (e.g. float/double/std::complex). Calling .resize() on
+ * this container will leave any newly allocated data uninitialised, unlike
+ * std::vector which will default-initialise it.
+ */
+template<typename T, typename R = reallocator>
+class pod_vector {
+public:
+  using size_type = std::size_t;
+  using value_type = T;
+  using pointer = T *;
+  using const_pointer = const T *;
+  using iterator = pointer;
+  using const_iterator = const_pointer;
+  using reference = T &;
+  using reverse_iterator = std::reverse_iterator<iterator>;
+  using const_reverse_iterator = std::reverse_iterator<const_iterator>;
+
+private:
+  static constexpr size_type growth_factor{2};
+
+  value_type *m_data = nullptr;
+  size_type m_capacity = 0;
+  size_type m_size = 0;
+  R m_realloc;
+
+  template<typename T2, typename R2>
+  friend class pod_vector;
+
+  /**
+   * Allocates or reallocates (copying the data) the data_pointer to the
+   * specified size (in number of elements of T), updating the capacity in
+   * the process.
+   */
+  void realloc(size_type size) {
+    if (size != m_capacity) {
+      m_data = static_cast<T *>(m_realloc.reallocate(m_data, sizeof(T) * size));
+      m_capacity = size;
+    }
+  }
+
+public:
+  pod_vector(const R &realloc = R()) : m_realloc{realloc} {}
+
+  pod_vector(const pod_vector<T, R> &other) {
+    *this = other;
+  }
+
+  pod_vector(pod_vector<T, R> &&other) {
+    *this = std::move(other);
+  }
+
+  pod_vector(size_type size, const R &realloc = R()) : m_realloc{realloc} {
+    resize(size);
+  }
+
+  pod_vector(size_type size, const T &value, const R &realloc = R())
+    : m_realloc{realloc} {
+    assign(size, value);
+  }
+
+  template<typename It1, typename = std::enable_if_t<is_input_iterator<It1>>>
+  pod_vector(It1 first, It1 last, const R &realloc = R()) : m_realloc{realloc} {
+    assign(first, last);
+  }
+
+  ~pod_vector() {
+    assert(m_size <= m_capacity);
+    assert(m_data || m_capacity == 0);
+    if (m_data) {
+      m_realloc.deallocate(m_data);
+    }
+  }
+
+  pod_vector<T, R> &operator=(pod_vector<T, R> &&other) {
+    swap(other);
+    return *this;
+  }
+
+  pod_vector<T, R> &operator=(const pod_vector<T, R> &other) {
+    if (&other == this) {
+      return *this;
+    }
+    realloc(other.m_capacity);
+    m_size = other.m_size;
+    std::memcpy((void *)m_data, (const void *)other.m_data, sizeof(T) * m_size);
+    m_realloc = other.m_realloc;
+    return *this;
+  }
+
+  void assign(size_type size, const T &value) {
+    resize(size);
+    std::fill_n(m_data, size, value);
+  }
+
+  template<typename It1, typename = std::enable_if_t<is_input_iterator<It1>>>
+  void assign(It1 first, It1 last) {
+    resize(std::distance(first, last));
+    std::copy(first, last, m_data);
+  }
+
+  reference at(size_type pos) {
+    assert(pos >= 0);
+    assert(pos < m_size);
+    assert(m_data);
+    return m_data[pos];
+  }
+
+  reference at(size_type pos) const {
+    assert(pos >= 0);
+    assert(pos < m_size);
+    assert(m_data);
+    return m_data[pos];
+  }
+
+  reference operator[](size_type pos) {
+    return m_data[pos];
+  }
+
+  reference operator[](size_type pos) const {
+    return m_data[pos];
+  }
+
+  reference front() {
+    return at(0);
+  }
+
+  reference front() const {
+    return at(0);
+  }
+
+  reference back() {
+    return at(m_size - 1);
+  }
+
+  reference back() const {
+    return at(m_size - 1);
+  }
+
+  pointer data() {
+    return m_data;
+  }
+
+  const_pointer data() const {
+    return m_data;
+  }
+
+  iterator begin() {
+    return m_data;
+  }
+
+  const_iterator begin() const {
+    return m_data;
+  }
+
+  const_iterator cbegin() const {
+    return m_data;
+  }
+
+  iterator end() {
+    return m_data + m_size;
+  }
+
+  const_iterator end() const {
+    return m_data + m_size;
+  }
+
+  const_iterator cend() const {
+    return m_data + m_size;
+  }
+
+  reverse_iterator rbegin() {
+    return {end()};
+  }
+
+  const_reverse_iterator rbegin() const {
+    return {end()};
+  }
+
+  const_reverse_iterator crbegin() const {
+    return {cend()};
+  }
+
+  reverse_iterator rend() {
+    return {begin()};
+  }
+
+  const_reverse_iterator rend() const {
+    return {begin()};
+  }
+
+  const_reverse_iterator crend() const {
+    return {cbegin()};
+  }
+
+  bool empty() const {
+    return m_size == 0;
+  }
+
+  size_type size() const {
+    return m_size;
+  }
+
+  size_type capacity() const {
+    return m_capacity;
+  }
+
+  void reserve(size_type new_cap) {
+    if (new_cap <= m_capacity) {
+      return;
+    }
+    realloc(new_cap);
+    assert(m_data);
+  }
+
+  void shrink_to_fit() {
+    realloc(m_size);
+    assert(m_data || m_size == 0);
+  }
+
+  void resize(size_type count) {
+    realloc(count);
+    assert(m_data || count == 0);
+    m_size = count;
+  }
+
+  void resize(size_type count, const value_type &value) {
+    realloc(count);
+    assert(m_data || count == 0);
+    if (count > m_size) {
+      std::fill(m_data + m_size, m_data + count, value);
+    }
+    m_size = count;
+  }
+
+  void clear() {
+    resize(0);
+  }
+
+  void swap(pod_vector<T, R> &other) {
+    std::swap(m_data, other.m_data);
+    std::swap(m_capacity, other.m_capacity);
+    std::swap(m_size, other.m_size);
+  }
+
+  void push_back(T value) {
+    if (m_size == m_capacity) {
+      size_type new_capacity = m_capacity == 0 ? 1 : m_capacity * growth_factor;
+      realloc(new_capacity);
+    }
+    m_data[m_size++] = std::move(value);
+  }
+};
+
+/** A wrapper around void* equivalent roughly to void&.
+ *  void& is disallowed by the language, but if all we want
+ *  is to take the address of it there's no problem. */
+class void_ref {
+  void *m_ptr;
+
+public:
+  void_ref(void *ptr) : m_ptr(ptr) {}
+
+  void *operator&() {
+    return m_ptr;
+  }
+
+  const void *operator&() const {
+    return m_ptr;
+  }
+};
+
+/** A vector-like container for unknown value types (roughly a
+ *  pod_vector<std::any>). This allows us to keep the memory management bits
+ *  that we like from pod_vector while not caring about the exact underlying
+ *  type.
+ */
+template<typename R>
+class pod_vector<void, R> {
+public:
+  using size_type = std::size_t;
+  using pointer = void *;
+  using const_pointer = const void *;
+  using reference = void_ref;
+
+private:
+  void *m_data = nullptr;
+  size_type m_capacity = 0;
+  size_type m_size = 0;
+  R m_realloc;
+
+  /** In addition to the parameters from a standard pod_vector, we must also
+   *  keep the size of each element, since we don't know it otherwise. */
+  size_type m_elem_size = 0;
+
+public:
+  pod_vector() = default;
+
+  template<typename T>
+  pod_vector(const pod_vector<T, R> &other) {
+    *this = other;
+  }
+
+  template<typename T>
+  pod_vector(pod_vector<T, R> &&other) {
+    *this = std::move(other);
+  }
+
+  pod_vector(const pod_vector<void, R> &other) {
+    *this = other;
+  }
+
+  pod_vector(pod_vector<void, R> &&other) {
+    *this = std::move(other);
+  }
+
+  /** We can move into this from any other pod_vector, since we can know the
+   *  element size by doing sizeof(T) */
+  template<typename T>
+  pod_vector<void, R> &operator=(pod_vector<T, R> &&other) {
+    if ((void *)this == (void *)&other) {
+      return *this;
+    }
+    if (m_data) {
+      m_realloc.deallocate(m_data);
+    }
+    m_data = other.m_data;
+    m_capacity = other.m_capacity;
+    m_size = other.m_size;
+    m_elem_size = sizeof(T);
+    other.m_data = nullptr;
+    other.m_capacity = other.m_size = 0;
+    return *this;
+  }
+
+  template<typename T>
+  pod_vector<void, R> &operator=(const pod_vector<T, R> &other) {
+    if ((void *)this == (void *)&other) {
+      return *this;
+    }
+    if (other.m_capacity * sizeof(T) != m_capacity * m_elem_size) {
+      m_data = m_realloc.reallocate(m_data, other.m_capacity * sizeof(T));
+    }
+    m_capacity = other.m_capacity;
+    m_size = other.m_size;
+    m_elem_size = sizeof(T);
+    std::memcpy(m_data, other.m_data, m_elem_size * m_size);
+    return *this;
+  }
+
+  /** We can move into this from any other pod_vector<void, R>, since we already
+   *  have the (original) element size. */
+  pod_vector<void, R> &operator=(pod_vector<void, R> &&other) {
+    if (this == &other) {
+      return *this;
+    }
+    if (m_data != nullptr) {
+      m_realloc.deallocate(m_data);
+    }
+    m_data = other.m_data;
+    m_capacity = other.m_capacity;
+    m_size = other.m_size;
+    m_elem_size = other.m_elem_size;
+    other.m_data = nullptr;
+    other.m_capacity = other.m_size = other.m_elem_size = 0;
+    return *this;
+  }
+
+  pod_vector<void, R> &operator=(const pod_vector<void, R> &other) {
+    if (&other == this) {
+      return *this;
+    }
+    if (other.m_capacity * other.m_elem_size != m_capacity * m_elem_size) {
+      m_data =
+          m_realloc.reallocate(m_data, other.m_capacity * other.m_elem_size);
+    }
+    m_capacity = other.m_capacity;
+    m_size = other.m_size;
+    m_elem_size = other.m_elem_size;
+    std::memcpy(m_data, other.m_data, m_elem_size * m_size);
+    return *this;
+  }
+
+  ~pod_vector() {
+    assert(m_size <= m_capacity);
+    assert(m_data || m_capacity == 0);
+    if (m_data != nullptr) {
+      m_realloc.deallocate(m_data);
+    }
+  }
+
+  reference at(size_type pos) {
+    assert(pos >= 0);
+    assert(pos < m_size);
+    assert(m_data);
+    uintptr_t data = reinterpret_cast<uintptr_t>(m_data);
+    return reinterpret_cast<void *>(data + pos * m_elem_size);
+  }
+
+  reference at(size_type pos) const {
+    assert(pos >= 0);
+    assert(pos < m_size);
+    assert(m_data);
+    uintptr_t data = reinterpret_cast<uintptr_t>(m_data);
+    return reinterpret_cast<void *>(data + pos * m_elem_size);
+  }
+
+  reference operator[](size_type pos) {
+    uintptr_t data = reinterpret_cast<uintptr_t>(m_data);
+    return reinterpret_cast<void *>(data + pos * m_elem_size);
+  }
+
+  reference operator[](size_type pos) const {
+    uintptr_t data = reinterpret_cast<uintptr_t>(m_data);
+    return reinterpret_cast<void *>(data + pos * m_elem_size);
+  }
+
+  reference front() {
+    return at(0);
+  }
+
+  reference front() const {
+    return at(0);
+  }
+
+  reference back() {
+    return at(m_size - 1);
+  }
+
+  reference back() const {
+    return at(m_size - 1);
+  }
+
+  pointer data() {
+    return m_data;
+  }
+
+  const_pointer data() const {
+    return m_data;
+  }
+
+  bool empty() const {
+    return m_size == 0;
+  }
+
+  size_type size() const {
+    return m_size;
+  }
+
+  size_type capacity() const {
+    return m_capacity;
+  }
+};
+
+template<typename V>
+struct is_vec_type : std::false_type {};
+
+template<typename T>
+struct is_vec_type<std::vector<T>> : std::true_type {};
+
+template<typename T, typename R>
+struct is_vec_type<armral::utils::pod_vector<T, R>> : std::true_type {};
+
+/**
+ * Utility to help check that a template parameter is a vector type
+ * (either std::vector or @ref armral::utils::pod_vector for now)
+ */
+template<typename V>
+constexpr bool is_vec_type_v = is_vec_type<V>::value;
+
+template<typename T>
+struct has_complex_value_type : std::false_type {};
+
+template<typename T>
+struct has_complex_value_type<std::vector<std::complex<T>>> : std::true_type {};
+
+template<typename T, typename R>
+struct has_complex_value_type<armral::utils::pod_vector<std::complex<T>, R>>
+  : std::true_type {};
+
+template<typename T>
+constexpr bool has_complex_value_type_v = has_complex_value_type<T>::value;
+
+} // namespace armral::utils
diff --git a/utils/pragmas.hpp b/utils/pragmas.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..c32de15c1ec7e28d988fd5cdafd178db3da5f027
--- /dev/null
+++ b/utils/pragmas.hpp
@@ -0,0 +1,17 @@
+/*
+    Arm RAN Acceleration Library
+    SPDX-FileCopyrightText: <text>Copyright 2020-2025 Arm Limited and/or its
+    affiliates <open-source-office@arm.com></text>
+    SPDX-License-Identifier: BSD-3-Clause
+*/
+#pragma once
+
+#if __clang__
+
+#define ARMRAL_PRAGMA_AUTOVEC _Pragma("clang loop vectorize(enable)")
+
+#else
+
+#define ARMRAL_PRAGMA_AUTOVEC _Pragma("GCC ivdep")
+
+#endif
diff --git a/utils/reallocator.hpp b/utils/reallocator.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..63a5e2f2701020907ccadc1de0c474b1bdd74f86
--- /dev/null
+++ b/utils/reallocator.hpp
@@ -0,0 +1,39 @@
+/*
+    Arm RAN Acceleration Library
+    SPDX-FileCopyrightText: <text>Copyright 2025 Arm Limited and/or its
+    affiliates <open-source-office@arm.com></text>
+    SPDX-License-Identifier: BSD-3-Clause
+*/
+#pragma once
+
+#include <cstdlib>
+
+namespace armral::utils {
+
+/** A derivative of the standard allocator interface which offers reallocation
+ * of previously allocated memory. std::realloc is used for all reallocations,
+ * and like std::realloc an exact number of bytes must be specified for
+ * reallocation.
+ */
+class reallocator {
+public:
+  using pointer = void *;
+  using const_pointer = const pointer;
+  using size_type = std::size_t;
+
+  constexpr reallocator() noexcept = default;
+
+  constexpr reallocator(const reallocator &other) noexcept = default;
+
+  ~reallocator() = default;
+
+  static pointer reallocate(const_pointer ptr, size_type new_size) {
+    return static_cast<pointer>(std::realloc(ptr, new_size));
+  }
+
+  static void deallocate(const_pointer ptr) {
+    std::free(ptr);
+  }
+};
+
+} // namespace armral::utils
diff --git a/utils/rng.cpp b/utils/rng.cpp
index 23e03edaf64b785ca024bd3690cca74cbb547de1..1211019eecd526b07a9f7bffc16f1d339b694f90 100644
--- a/utils/rng.cpp
+++ b/utils/rng.cpp
@@ -6,6 +6,7 @@
 */
 #include <arm_neon.h>
 
+#include "armral.h"
 #include "rng.hpp"
 
 namespace armral::utils {
@@ -42,6 +43,13 @@ float32_t linear_congruential_generator::one<float32_t>(random_state *state) {
   return (float32_t)x / 0x80000000U;
 }
 
+template<>
+armral_half
+linear_congruential_generator::one<armral_half>(random_state *state) {
+  return static_cast<armral_half>(
+      linear_congruential_generator::one<float32_t>(state));
+}
+
 template<>
 double linear_congruential_generator::one<double>(random_state *state) {
   auto x = lcg_step(state->seed);
diff --git a/utils/rng.hpp b/utils/rng.hpp
index d366a8ed7840558b5529c3bae3eedc680da3b0a3..a2da3d02341e3be596138d1ee2a801281917afcd 100644
--- a/utils/rng.hpp
+++ b/utils/rng.hpp
@@ -7,6 +7,8 @@
 
 #pragma once
 
+#include "armral.h"
+
 #include <cstdint>
 #include <cstdlib>
 #include <initializer_list>
@@ -47,8 +49,9 @@ public:
    * @param[out]    max     The upper bound of the generated value.
    * @returns a pseudo-randomly generated floating-point value.
    */
-  template<typename T,
-           std::enable_if_t<std::is_floating_point_v<T>, bool> = true>
+  template<typename T, std::enable_if_t<std::is_floating_point_v<T> ||
+                                            std::is_same_v<T, armral_half>,
+                                        bool> = true>
   T one(random_state *state, const T min, const T max) {
     return (max - min) * one<T>(state) + min;
   }
diff --git a/utils/unique_ptr.hpp b/utils/unique_ptr.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..3e6f3f6c2da94f168ca83dd69fe11f127cd4dd73
--- /dev/null
+++ b/utils/unique_ptr.hpp
@@ -0,0 +1,74 @@
+/*
+    Arm RAN Acceleration Library
+    SPDX-FileCopyrightText: <text>Copyright 2025 Arm Limited and/or its
+    affiliates <open-source-office@arm.com></text>
+    SPDX-License-Identifier: BSD-3-Clause
+*/
+#pragma once
+
+#include <cstdlib>
+#include <new>
+#include <utility>
+
+namespace armral::utils {
+
+template<class T>
+class unique_ptr {
+  /* Custom minimal implementation of unique_ptr. std::unique_ptr may invoke
+     operator new, which introduces dependency on libstdc++. This version uses
+     malloc and placement new, which has no such dependency.  */
+public:
+  explicit unique_ptr(T *ptr) noexcept : m_ptr(ptr) {}
+
+  /* Delete copy-assignment and copy-constructor - if you can copy it then it's
+     not unique!  */
+  unique_ptr(const unique_ptr &) = delete;
+  unique_ptr &operator=(const unique_ptr &) = delete;
+
+  /* Converting constructor - this is required for implicit conversion from
+     unique_ptr<Derived> to unique_ptr<Base>.  */
+  template<class U, std::enable_if_t<std::is_base_of_v<T, U> &&
+                                         std::is_convertible_v<U *, T *>,
+                                     bool> = true>
+  unique_ptr(unique_ptr<U> &&other) noexcept : m_ptr{other.release()} {}
+
+  unique_ptr(unique_ptr<T> &&other) noexcept : m_ptr{other.release()} {}
+
+  ~unique_ptr() {
+    // Check for null ptr - this may have been moved or released
+    if (m_ptr) {
+      m_ptr->~T();
+      free(m_ptr);
+    }
+  }
+
+  T *release() noexcept {
+    T *copy = m_ptr;
+    m_ptr = nullptr;
+    return copy;
+  }
+
+  T *get() const noexcept {
+    return m_ptr;
+  }
+
+  explicit operator bool() const noexcept {
+    return m_ptr;
+  }
+
+  T *operator->() const noexcept {
+    return m_ptr;
+  }
+
+private:
+  T *m_ptr;
+};
+
+template<class T, class... Args>
+unique_ptr<T> make_unique(Args &&...args) {
+  T *buf = (T *)malloc(sizeof(T));
+  T *obj = new (buf) T(std::forward<Args>(args)...);
+  return unique_ptr(obj);
+}
+
+} // namespace armral::utils