diff --git a/ethosu/regor/compiler/compiler.cpp b/ethosu/regor/compiler/compiler.cpp
index 9cb6c7de65e341186822f17ce4c876bc4e7c0a03..9b972fd5a3aaba38055c6798dabddc5051f8c294 100644
--- a/ethosu/regor/compiler/compiler.cpp
+++ b/ethosu/regor/compiler/compiler.cpp
@@ -178,7 +178,11 @@ bool Compiler::ParseOptions(const char *text, size_t size)
         }
         else if ( section == "scheduler" )
         {
-            ParseSchedulerOptions(_schedulerOptions, reader);
+            if ( !ParseSchedulerOptions(_schedulerOptions, reader) )
+            {
+                SetLastError(fmt::format("Error parsing [{}]", section));
+                return false;
+            }
         }
         else if ( section == "graph" )
         {
diff --git a/ethosu/regor/compiler/scheduler.cpp b/ethosu/regor/compiler/scheduler.cpp
index 926c0232050a5f365eb214607c1c0eaa62ba001d..112b9486a45101d500793310cf21fba038f7db1a 100644
--- a/ethosu/regor/compiler/scheduler.cpp
+++ b/ethosu/regor/compiler/scheduler.cpp
@@ -50,7 +50,7 @@ namespace regor
 {
 
 constexpr int AllocationQuantum = 16;
-constexpr int AlignmentQuantum = 16;
+constexpr int NPUTensorAlignment = 16;
 
 static Shape GetShapeForFormat(const Shape &shape, TensorFormat format)
 {
@@ -828,11 +828,13 @@ void Scheduler::MoveConstantData(Schedule *refSchedule)
 bool Scheduler::AllocateAddresses(Schedule *schedule)
 {
     const auto verbose = _options.verboseAllocation;
-    AllocateTensors(_ops, schedule, _arch->FeatureMapMemory(), TensorAllocator::HillClimb, AlignmentQuantum, verbose);
+    // If graph input/output tensors are in FeatureMap memory, allocate with the user-specified tensor alignment
+    AllocateTensors(_ops, schedule, _arch->FeatureMapMemory(), TensorAllocator::HillClimb,
+        _options.separateIORegions ? NPUTensorAlignment : _options.cpuTensorAlignment, verbose);
     if ( _spilling )
     {
         const auto limit = _options.optimizationStagingLimit;
-        AllocateTensors(_ops, schedule, _arch->StagingMemory(), TensorAllocator::HillClimb, AlignmentQuantum, verbose, limit);
+        AllocateTensors(_ops, schedule, _arch->StagingMemory(), TensorAllocator::HillClimb, NPUTensorAlignment, verbose, limit);
         return schedule->memoryUsage[_arch->StagingMemory()] <= limit;
     }
 
@@ -891,7 +893,7 @@ void Scheduler::AllocateReadOnlyAddresses(Schedule *schedule, IncrementalLinearA
 {
     auto lrGraph = ReadOnlyLiveRangeGraph(_arch);
     lrGraph.ExtractLiveRangesFromCascades(_ops, schedule, _arch->ReadonlyMemory(), false);
-    auto totalSize = readOnlyAllocator.Allocate(&lrGraph, AlignmentQuantum, _options.verboseAllocation);
+    auto totalSize = readOnlyAllocator.Allocate(&lrGraph, NPUTensorAlignment, _options.verboseAllocation);
     schedule->memoryUsage[_arch->ReadonlyMemory()] = int(totalSize);
 }
 
@@ -904,8 +906,8 @@ void Scheduler::AllocateIOAddresses(Schedule *schedule, const std::vector
         assert(_arch->InputFeatureMapMemory() != _arch->OutputFeatureMapMemory());
-        AllocateTensors(ops, schedule, _arch->InputFeatureMapMemory(), TensorAllocator::LinearAlloc, AlignmentQuantum, verbose);
-        AllocateTensors(ops, schedule, _arch->OutputFeatureMapMemory(), TensorAllocator::LinearAlloc, AlignmentQuantum, verbose);
+        AllocateTensors(ops, schedule, _arch->InputFeatureMapMemory(), TensorAllocator::LinearAlloc, NPUTensorAlignment, verbose);
+        AllocateTensors(ops, schedule, _arch->OutputFeatureMapMemory(), TensorAllocator::LinearAlloc, NPUTensorAlignment, verbose);
     }
 }
 
@@ -1847,7 +1849,7 @@ void Scheduler::PrintSchedule(Schedule *schedule)
 }
 
 
-void ParseSchedulerOptions(SchedulerOptions &opt, IniReader &reader)
+bool ParseSchedulerOptions(SchedulerOptions &opt, IniReader &reader)
 {
     // Parse debug settings
     std::string key;
@@ -1904,9 +1906,21 @@ void ParseSchedulerOptions(SchedulerOptions &opt, IniReader &reader)
         {
             opt.separateIORegions = reader.Get();
         }
+        else if ( key == "cpu_tensor_alignment" )
+        {
+            opt.cpuTensorAlignment = reader.Get();
+        }
 
         reader.End();
     }
+
+    if ( opt.cpuTensorAlignment <= 0 || opt.cpuTensorAlignment % NPUTensorAlignment != 0 )
+    {
+        LOG_ERROR("CPU tensor alignment ({}) must be a positive multiple of {}\n", opt.cpuTensorAlignment, NPUTensorAlignment);
+        return false;
+    }
+
+    return true;
 }
diff --git a/ethosu/regor/compiler/scheduler.hpp b/ethosu/regor/compiler/scheduler.hpp
index 5747b907607ea11e34c48882845173ccbf8e34d8..d327003d10621f4aa991dbb8683abd069d26d716 100644
--- a/ethosu/regor/compiler/scheduler.hpp
+++ b/ethosu/regor/compiler/scheduler.hpp
@@ -66,6 +66,7 @@ struct SchedulerOptions
     bool verboseAllocation = false;
     Flags disabled;
     bool separateIORegions = false;
+    int cpuTensorAlignment = 16;
 };
 
 struct WeightScaleEncoding
@@ -370,6 +371,6 @@ private:
         Shape &ofmShape, Flags supportedFormats);
 };
 
-void ParseSchedulerOptions(SchedulerOptions &opt, IniReader &reader);
+bool ParseSchedulerOptions(SchedulerOptions &opt, IniReader &reader);
 
 } // namespace regor
diff --git a/ethosu/vela/vela.py b/ethosu/vela/vela.py
index 4ef48ce402d340f6e3430589c3dc2b5d4608748c..686ab4e76080a5fd17b548b4925b2ffa78bdca02 100755
--- a/ethosu/vela/vela.py
+++ b/ethosu/vela/vela.py
@@ -825,6 +825,7 @@ def get_compiler_config(
     disable_buffering: bool,
     cop_format: str,
     separate_io_regions: bool,
+    cpu_tensor_alignment: int,
 ) -> str:
     """Build compiler config file."""
     config = "\n[compiler]\n"
@@ -859,6 +860,7 @@ def get_compiler_config(
     config = config.rstrip("|") + "\n"
     if separate_io_regions:
         config += "separate_io_regions=true\n"
+    config += f"cpu_tensor_alignment={cpu_tensor_alignment}\n"
 
     config += "\n[graph]\n"
     if verbose_graph:
@@ -1063,7 +1065,7 @@ def main(argv: Optional[List[str]] = None) -> int:
         type=int,
         default=Tensor.AllocationQuantum,
         help=(
-            "Controls the allocation byte alignment of cpu tensors including Ethos-U Custom"
+            "Controls the allocation byte alignment of CPU tensors including Ethos-U Custom"
            " operator inputs and outputs (default: %(default)s Bytes)"
        ),
    )
@@ -1228,6 +1230,7 @@ def main(argv: Optional[List[str]] = None) -> int:
         args.disable_buffering,
         args.cop_format,
         args.separate_io_regions,
+        args.cpu_tensor_alignment,
     )
 
     process_regor(
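
Note: with this change, the value of Vela's existing CPU tensor alignment command-line option (parsed into args.cpu_tensor_alignment, default 16 bytes) is forwarded to the regor scheduler through the generated compiler config. A minimal sketch of the resulting [scheduler] section follows; other keys are omitted and the value 64 is illustrative, not taken from the patch:

    [scheduler]
    cpu_tensor_alignment=64

ParseSchedulerOptions() accepts only positive multiples of NPUTensorAlignment (16); any other value is rejected and Compiler::ParseOptions() reports "Error parsing [scheduler]".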