From 5235deaa78da83f6d344a4883b54cc29e85176e7 Mon Sep 17 00:00:00 2001 From: Johan Gunnarsson Date: Wed, 26 Mar 2025 18:14:03 +0100 Subject: [PATCH] MLBEDSW-9531: Use tensor alignment flag in Regor This flag is used when allocating scratch memory and not using separate IO regions. Read-only memory and staging memory are never accessed from the CPU, so keep using the default NPU alignment for those tensors. Signed-off-by: Johan Gunnarsson Change-Id: I0977e671b1b2948781e5dd52ba686019b1ffa921 --- ethosu/regor/compiler/compiler.cpp | 6 +++++- ethosu/regor/compiler/scheduler.cpp | 28 +++++++++++++++++++++------- ethosu/regor/compiler/scheduler.hpp | 3 ++- ethosu/vela/vela.py | 5 ++++- 4 files changed, 32 insertions(+), 10 deletions(-) diff --git a/ethosu/regor/compiler/compiler.cpp b/ethosu/regor/compiler/compiler.cpp index 9cb6c7de..9b972fd5 100644 --- a/ethosu/regor/compiler/compiler.cpp +++ b/ethosu/regor/compiler/compiler.cpp @@ -178,7 +178,11 @@ bool Compiler::ParseOptions(const char *text, size_t size) } else if ( section == "scheduler" ) { - ParseSchedulerOptions(_schedulerOptions, reader); + if ( !ParseSchedulerOptions(_schedulerOptions, reader) ) + { + SetLastError(fmt::format("Error parsing [{}]", section)); + return false; + } } else if ( section == "graph" ) { diff --git a/ethosu/regor/compiler/scheduler.cpp b/ethosu/regor/compiler/scheduler.cpp index 926c0232..112b9486 100644 --- a/ethosu/regor/compiler/scheduler.cpp +++ b/ethosu/regor/compiler/scheduler.cpp @@ -50,7 +50,7 @@ namespace regor { constexpr int AllocationQuantum = 16; -constexpr int AlignmentQuantum = 16; +constexpr int NPUTensorAlignment = 16; static Shape GetShapeForFormat(const Shape &shape, TensorFormat format) { @@ -828,11 +828,13 @@ void Scheduler::MoveConstantData(Schedule *refSchedule) bool Scheduler::AllocateAddresses(Schedule *schedule) { const auto verbose = _options.verboseAllocation; - AllocateTensors(_ops, schedule, _arch->FeatureMapMemory(), TensorAllocator::HillClimb, AlignmentQuantum, 
verbose); + // If graph input/outputs tensors are in FeatureMap memory, allocate with user-specified tensor alignment + AllocateTensors(_ops, schedule, _arch->FeatureMapMemory(), TensorAllocator::HillClimb, + _options.separateIORegions ? NPUTensorAlignment : _options.cpuTensorAlignment, verbose); if ( _spilling ) { const auto limit = _options.optimizationStagingLimit; - AllocateTensors(_ops, schedule, _arch->StagingMemory(), TensorAllocator::HillClimb, AlignmentQuantum, verbose, limit); + AllocateTensors(_ops, schedule, _arch->StagingMemory(), TensorAllocator::HillClimb, NPUTensorAlignment, verbose, limit); return schedule->memoryUsage[_arch->StagingMemory()] <= limit; } @@ -891,7 +893,7 @@ void Scheduler::AllocateReadOnlyAddresses(Schedule *schedule, IncrementalLinearA { auto lrGraph = ReadOnlyLiveRangeGraph(_arch); lrGraph.ExtractLiveRangesFromCascades(_ops, schedule, _arch->ReadonlyMemory(), false); - auto totalSize = readOnlyAllocator.Allocate(&lrGraph, AlignmentQuantum, _options.verboseAllocation); + auto totalSize = readOnlyAllocator.Allocate(&lrGraph, NPUTensorAlignment, _options.verboseAllocation); schedule->memoryUsage[_arch->ReadonlyMemory()] = int(totalSize); } @@ -904,8 +906,8 @@ void Scheduler::AllocateIOAddresses(Schedule *schedule, const std::vectorInputFeatureMapMemory() != _arch->OutputFeatureMapMemory()); - AllocateTensors(ops, schedule, _arch->InputFeatureMapMemory(), TensorAllocator::LinearAlloc, AlignmentQuantum, verbose); - AllocateTensors(ops, schedule, _arch->OutputFeatureMapMemory(), TensorAllocator::LinearAlloc, AlignmentQuantum, verbose); + AllocateTensors(ops, schedule, _arch->InputFeatureMapMemory(), TensorAllocator::LinearAlloc, NPUTensorAlignment, verbose); + AllocateTensors(ops, schedule, _arch->OutputFeatureMapMemory(), TensorAllocator::LinearAlloc, NPUTensorAlignment, verbose); } } @@ -1847,7 +1849,7 @@ void Scheduler::PrintSchedule(Schedule *schedule) } -void ParseSchedulerOptions(SchedulerOptions &opt, IniReader &reader) +bool 
ParseSchedulerOptions(SchedulerOptions &opt, IniReader &reader) { // Parse debug settings std::string key; @@ -1904,9 +1906,21 @@ void ParseSchedulerOptions(SchedulerOptions &opt, IniReader &reader) { opt.separateIORegions = reader.Get(); } + else if ( key == "cpu_tensor_alignment" ) + { + opt.cpuTensorAlignment = reader.Get(); + } reader.End(); } + + if ( opt.cpuTensorAlignment <= 0 || opt.cpuTensorAlignment % NPUTensorAlignment != 0 ) + { + LOG_ERROR("CPU tensor alignment ({}) must be a multiple of {}\n", opt.cpuTensorAlignment, NPUTensorAlignment); + return false; + } + + return true; } diff --git a/ethosu/regor/compiler/scheduler.hpp b/ethosu/regor/compiler/scheduler.hpp index 5747b907..d327003d 100644 --- a/ethosu/regor/compiler/scheduler.hpp +++ b/ethosu/regor/compiler/scheduler.hpp @@ -66,6 +66,7 @@ struct SchedulerOptions bool verboseAllocation = false; Flags disabled; bool separateIORegions = false; + int cpuTensorAlignment = 16; }; struct WeightScaleEncoding @@ -370,6 +371,6 @@ private: Shape &ofmShape, Flags supportedFormats); }; -void ParseSchedulerOptions(SchedulerOptions &opt, IniReader &reader); +bool ParseSchedulerOptions(SchedulerOptions &opt, IniReader &reader); } // namespace regor diff --git a/ethosu/vela/vela.py b/ethosu/vela/vela.py index 4ef48ce4..686ab4e7 100755 --- a/ethosu/vela/vela.py +++ b/ethosu/vela/vela.py @@ -825,6 +825,7 @@ def get_compiler_config( disable_buffering: bool, cop_format: str, separate_io_regions: bool, + cpu_tensor_alignment: int, ) -> str: """Build compiler config file.""" config = "\n[compiler]\n" @@ -859,6 +860,7 @@ def get_compiler_config( config = config.rstrip("|") + "\n" if separate_io_regions: config += "separate_io_regions=true\n" + config += f"cpu_tensor_alignment={cpu_tensor_alignment}\n" config += "\n[graph]\n" if verbose_graph: @@ -1063,7 +1065,7 @@ def main(argv: Optional[List[str]] = None) -> int: type=int, default=Tensor.AllocationQuantum, help=( - "Controls the allocation byte alignment of cpu tensors 
including Ethos-U Custom" + "Controls the allocation byte alignment of CPU tensors including Ethos-U Custom" " operator inputs and outputs (default: %(default)s Bytes)" ), ) @@ -1228,6 +1230,7 @@ def main(argv: Optional[List[str]] = None) -> int: args.disable_buffering, args.cop_format, args.separate_io_regions, + args.cpu_tensor_alignment, ) process_regor( -- GitLab