From 5235deaa78da83f6d344a4883b54cc29e85176e7 Mon Sep 17 00:00:00 2001
From: Johan Gunnarsson <johan.gunnarsson@arm.com>
Date: Wed, 26 Mar 2025 18:14:03 +0100
Subject: [PATCH] MLBEDSW-9531: Use tensor alignment flag in Regor

This flag is used when allocating scratch memory and separate IO
regions are not in use. Read-only memory and staging memory are
never accessed from the CPU, so keep using the default NPU
alignment for those tensors.

Signed-off-by: Johan Gunnarsson <johan.gunnarsson@arm.com>
Change-Id: I0977e671b1b2948781e5dd52ba686019b1ffa921
---
 ethosu/regor/compiler/compiler.cpp  |  6 +++++-
 ethosu/regor/compiler/scheduler.cpp | 28 +++++++++++++++++++++-------
 ethosu/regor/compiler/scheduler.hpp |  3 ++-
 ethosu/vela/vela.py                 |  5 ++++-
 4 files changed, 32 insertions(+), 10 deletions(-)

diff --git a/ethosu/regor/compiler/compiler.cpp b/ethosu/regor/compiler/compiler.cpp
index 9cb6c7de..9b972fd5 100644
--- a/ethosu/regor/compiler/compiler.cpp
+++ b/ethosu/regor/compiler/compiler.cpp
@@ -178,7 +178,11 @@ bool Compiler::ParseOptions(const char *text, size_t size)
         }
         else if ( section == "scheduler" )
         {
-            ParseSchedulerOptions(_schedulerOptions, reader);
+            if ( !ParseSchedulerOptions(_schedulerOptions, reader) )
+            {
+                SetLastError(fmt::format("Error parsing [{}]", section));
+                return false;
+            }
         }
         else if ( section == "graph" )
         {
diff --git a/ethosu/regor/compiler/scheduler.cpp b/ethosu/regor/compiler/scheduler.cpp
index 926c0232..112b9486 100644
--- a/ethosu/regor/compiler/scheduler.cpp
+++ b/ethosu/regor/compiler/scheduler.cpp
@@ -50,7 +50,7 @@ namespace regor
 {
 
 constexpr int AllocationQuantum = 16;
-constexpr int AlignmentQuantum = 16;
+constexpr int NPUTensorAlignment = 16;
 
 static Shape GetShapeForFormat(const Shape &shape, TensorFormat format)
 {
@@ -828,11 +828,13 @@ void Scheduler::MoveConstantData(Schedule *refSchedule)
 bool Scheduler::AllocateAddresses(Schedule *schedule)
 {
     const auto verbose = _options.verboseAllocation;
-    AllocateTensors(_ops, schedule, _arch->FeatureMapMemory(), TensorAllocator::HillClimb, AlignmentQuantum, verbose);
+    // If graph input/output tensors are in FeatureMap memory, allocate with user-specified tensor alignment
+    AllocateTensors(_ops, schedule, _arch->FeatureMapMemory(), TensorAllocator::HillClimb,
+        _options.separateIORegions ? NPUTensorAlignment : _options.cpuTensorAlignment, verbose);
     if ( _spilling )
     {
         const auto limit = _options.optimizationStagingLimit;
-        AllocateTensors(_ops, schedule, _arch->StagingMemory(), TensorAllocator::HillClimb, AlignmentQuantum, verbose, limit);
+        AllocateTensors(_ops, schedule, _arch->StagingMemory(), TensorAllocator::HillClimb, NPUTensorAlignment, verbose, limit);
 
         return schedule->memoryUsage[_arch->StagingMemory()] <= limit;
     }
@@ -891,7 +893,7 @@ void Scheduler::AllocateReadOnlyAddresses(Schedule *schedule, IncrementalLinearA
 {
     auto lrGraph = ReadOnlyLiveRangeGraph(_arch);
     lrGraph.ExtractLiveRangesFromCascades(_ops, schedule, _arch->ReadonlyMemory(), false);
-    auto totalSize = readOnlyAllocator.Allocate(&lrGraph, AlignmentQuantum, _options.verboseAllocation);
+    auto totalSize = readOnlyAllocator.Allocate(&lrGraph, NPUTensorAlignment, _options.verboseAllocation);
     schedule->memoryUsage[_arch->ReadonlyMemory()] = int(totalSize);
 }
 
@@ -904,8 +906,8 @@ void Scheduler::AllocateIOAddresses(Schedule *schedule, const std::vector<std::u
     {
         assert(_arch->InputFeatureMapMemory() != _arch->OutputFeatureMapMemory());
 
-        AllocateTensors(ops, schedule, _arch->InputFeatureMapMemory(), TensorAllocator::LinearAlloc, AlignmentQuantum, verbose);
-        AllocateTensors(ops, schedule, _arch->OutputFeatureMapMemory(), TensorAllocator::LinearAlloc, AlignmentQuantum, verbose);
+        AllocateTensors(ops, schedule, _arch->InputFeatureMapMemory(), TensorAllocator::LinearAlloc, NPUTensorAlignment, verbose);
+        AllocateTensors(ops, schedule, _arch->OutputFeatureMapMemory(), TensorAllocator::LinearAlloc, NPUTensorAlignment, verbose);
     }
 }
 
@@ -1847,7 +1849,7 @@ void Scheduler::PrintSchedule(Schedule *schedule)
 }
 
 
-void ParseSchedulerOptions(SchedulerOptions &opt, IniReader &reader)
+bool ParseSchedulerOptions(SchedulerOptions &opt, IniReader &reader)
 {
     // Parse debug settings
     std::string key;
@@ -1904,9 +1906,21 @@ void ParseSchedulerOptions(SchedulerOptions &opt, IniReader &reader)
         {
             opt.separateIORegions = reader.Get<bool>();
         }
+        else if ( key == "cpu_tensor_alignment" )
+        {
+            opt.cpuTensorAlignment = reader.Get<int>();
+        }
 
         reader.End();
     }
+
+    if ( opt.cpuTensorAlignment <= 0 || opt.cpuTensorAlignment % NPUTensorAlignment != 0 )
+    {
+        LOG_ERROR("CPU tensor alignment ({}) must be a multiple of {}\n", opt.cpuTensorAlignment, NPUTensorAlignment);
+        return false;
+    }
+
+    return true;
 }
 
 
diff --git a/ethosu/regor/compiler/scheduler.hpp b/ethosu/regor/compiler/scheduler.hpp
index 5747b907..d327003d 100644
--- a/ethosu/regor/compiler/scheduler.hpp
+++ b/ethosu/regor/compiler/scheduler.hpp
@@ -66,6 +66,7 @@ struct SchedulerOptions
     bool verboseAllocation = false;
     Flags<SchedulerFeature> disabled;
     bool separateIORegions = false;
+    int cpuTensorAlignment = 16;
 };
 
 struct WeightScaleEncoding
@@ -370,6 +371,6 @@ private:
         Shape &ofmShape, Flags<WeightFormat> supportedFormats);
 };
 
-void ParseSchedulerOptions(SchedulerOptions &opt, IniReader &reader);
+bool ParseSchedulerOptions(SchedulerOptions &opt, IniReader &reader);
 
 }  // namespace regor
diff --git a/ethosu/vela/vela.py b/ethosu/vela/vela.py
index 4ef48ce4..686ab4e7 100755
--- a/ethosu/vela/vela.py
+++ b/ethosu/vela/vela.py
@@ -825,6 +825,7 @@ def get_compiler_config(
     disable_buffering: bool,
     cop_format: str,
     separate_io_regions: bool,
+    cpu_tensor_alignment: int,
 ) -> str:
     """Build compiler config file."""
     config = "\n[compiler]\n"
@@ -859,6 +860,7 @@ def get_compiler_config(
     config = config.rstrip("|") + "\n"
     if separate_io_regions:
         config += "separate_io_regions=true\n"
+    config += f"cpu_tensor_alignment={cpu_tensor_alignment}\n"
 
     config += "\n[graph]\n"
     if verbose_graph:
@@ -1063,7 +1065,7 @@ def main(argv: Optional[List[str]] = None) -> int:
         type=int,
         default=Tensor.AllocationQuantum,
         help=(
-            "Controls the allocation byte alignment of cpu tensors including Ethos-U Custom"
+            "Controls the allocation byte alignment of CPU tensors including Ethos-U Custom"
             " operator inputs and outputs (default: %(default)s Bytes)"
         ),
     )
@@ -1228,6 +1230,7 @@ def main(argv: Optional[List[str]] = None) -> int:
             args.disable_buffering,
             args.cop_format,
             args.separate_io_regions,
+            args.cpu_tensor_alignment,
         )
 
         process_regor(
-- 
GitLab