diff --git a/ethosu/regor/compiler/compiler.cpp b/ethosu/regor/compiler/compiler.cpp
index 3927855b4ccd75acb21958354f83c8c68be8dd04..9cb6c7de65e341186822f17ce4c876bc4e7c0a03 100644
--- a/ethosu/regor/compiler/compiler.cpp
+++ b/ethosu/regor/compiler/compiler.cpp
@@ -512,23 +512,6 @@ std::unique_ptr Compiler::CompileGraph(std::unique_ptr &graph,
         return nullptr;
     }
 
-    // At most 1 CustomNpuOp is supported when compiling with separate IO regions
-    if ( _schedulerOptions.separateIORegions )
-    {
-        std::vector<Operation *> ops;
-        newGraph->GetAllOperations(ops);
-        int customNpuOps = 0;
-        for ( auto op : ops )
-        {
-            if ( op->Type() == OpType::CustomNpuOp ) customNpuOps++;
-        }
-        if ( customNpuOps > 1 )
-        {
-            SetLastError("More than 1 CustomNpuOp is not supported with separate IO regions");
-            return nullptr;
-        }
-    }
-
     auto customOperatorBuilder = CustomOperatorBuilder(_architecture.get(), schedule.get());
     customOperatorBuilder.AllocateScratchTensors(tensorAddressMap);
 
@@ -538,6 +521,9 @@ std::unique_ptr Compiler::CompileGraph(std::unique_ptr &graph,
         auto *graphOp = pair.first;
         const auto *npuOp = pair.second.get();
 
+        // Allocate addresses for IO tensors with an address space that is local for this sequence of NPU ops
+        scheduler.AllocateIOAddresses(schedule.get(), npuOp->Operations());
+
         // Generate HLCS
         auto hlcsGenerator = HLCStreamGenerator();
         auto highLevelCommandStream = hlcsGenerator.GenerateCommandStream(npuOp, schedule.get(), _compilerOptions.verboseHighLevelCommandStream);
diff --git a/ethosu/regor/compiler/scheduler.cpp b/ethosu/regor/compiler/scheduler.cpp
index 6a69f55984d2dec20f3bea5549f1a6a4da7b5b5f..926c0232050a5f365eb214607c1c0eaa62ba001d 100644
--- a/ethosu/regor/compiler/scheduler.cpp
+++ b/ethosu/regor/compiler/scheduler.cpp
@@ -329,7 +329,8 @@ int Scheduler::UpdateSchedulerTensor(TensorUsage usage, SchedulerConnection *con
     }
 
     // Initial criteria (may change)
-    bool cpuTensor = tensor->hasCPUWriters || tensor->hasCPUReaders || tensor->isGraphInput || tensor->isGraphOutput;
+    bool cpuTensor =
+        tensor->hasCPUWriters || tensor->hasCPUReaders || tensor->isGraphInput || tensor->isGraphOutput || tensor->isPersistent;
     conn->requireFullTensor = conn->requireFullTensor || cpuTensor;
     tensor->needsLinearFormat = tensor->needsLinearFormat || cpuTensor || CheckLinearFormatForConcatSplit(tensor);
 
@@ -337,13 +338,9 @@
     {
         tensor->memArea = _arch->CPUMemory();
     }
-    else if ( _options.separateIORegions && !tensor->IsConstant() && cpuTensor && tensor->hasNPUReaders )
+    else if ( _options.separateIORegions && !tensor->IsConstant() && cpuTensor )
     {
-        tensor->memArea = _arch->InputFeatureMapMemory();
-    }
-    else if ( _options.separateIORegions && !tensor->IsConstant() && cpuTensor && tensor->hasNPUWriters )
-    {
-        tensor->memArea = _arch->OutputFeatureMapMemory();
+        tensor->memArea = tensor->hasNPUWriters ? _arch->OutputFeatureMapMemory() : _arch->InputFeatureMapMemory();
     }
 
     // Set tensor format to NHCWB16 for FeatureMaps, if possible
@@ -831,13 +828,7 @@ void Scheduler::MoveConstantData(Schedule *refSchedule)
 bool Scheduler::AllocateAddresses(Schedule *schedule)
 {
     const auto verbose = _options.verboseAllocation;
-    const auto separateIORegions = _options.separateIORegions;
     AllocateTensors(_ops, schedule, _arch->FeatureMapMemory(), TensorAllocator::HillClimb, AlignmentQuantum, verbose);
-    if ( separateIORegions )
-    {
-        AllocateTensors(_ops, schedule, _arch->InputFeatureMapMemory(), TensorAllocator::LinearAlloc, AlignmentQuantum, verbose);
-        AllocateTensors(_ops, schedule, _arch->OutputFeatureMapMemory(), TensorAllocator::LinearAlloc, AlignmentQuantum, verbose);
-    }
     if ( _spilling )
     {
         const auto limit = _options.optimizationStagingLimit;
@@ -905,6 +896,20 @@ void Scheduler::AllocateReadOnlyAddresses(Schedule *schedule, IncrementalLinearA
 }
 
+void Scheduler::AllocateIOAddresses(Schedule *schedule, const std::vector<std::unique_ptr<SchedulerOperation>> &ops)
+{
+    const auto verbose = _options.verboseAllocation;
+    const auto separateIORegions = _options.separateIORegions;
+    if ( separateIORegions )
+    {
+        assert(_arch->InputFeatureMapMemory() != _arch->OutputFeatureMapMemory());
+
+        AllocateTensors(ops, schedule, _arch->InputFeatureMapMemory(), TensorAllocator::LinearAlloc, AlignmentQuantum, verbose);
+        AllocateTensors(ops, schedule, _arch->OutputFeatureMapMemory(), TensorAllocator::LinearAlloc, AlignmentQuantum, verbose);
+    }
+}
+
+
 void Scheduler::UpdateOpMemorySnapshot(Schedule *schedule)
 {
     const auto fastStorage = _arch->StagingMemory();
diff --git a/ethosu/regor/compiler/scheduler.hpp b/ethosu/regor/compiler/scheduler.hpp
index c137540bbab189d0cbb782fda0bdded765e57332..5747b907607ea11e34c48882845173ccbf8e34d8 100644
--- a/ethosu/regor/compiler/scheduler.hpp
+++ b/ethosu/regor/compiler/scheduler.hpp
@@ -302,6 +302,8 @@ public:
 
     void AllocateReadOnlyAddresses(Schedule *schedule, IncrementalLinearAllocator &readOnlyAllocator);
 
+    void AllocateIOAddresses(Schedule *schedule, const std::vector<std::unique_ptr<SchedulerOperation>> &ops);
+
     static PerformanceQuery InitPerfQuery(SchedulerOperation *op, ArchitectureOpConfig *config, int ofm_depth = -1,
         WeightFormat wgtFormat = WeightFormat::Default, SchedulerOpInfo *cost = nullptr);
     static std::vector InitFusionQuery(SchedulerOperation *op);
diff --git a/ethosu/regor/compiler/scheduler_operation.hpp b/ethosu/regor/compiler/scheduler_operation.hpp
index 4cb6456b94e2d0037e91fab45a342616c684b078..391ee668635143f5ac5f8a9ab3df0d20d2074790 100644
--- a/ethosu/regor/compiler/scheduler_operation.hpp
+++ b/ethosu/regor/compiler/scheduler_operation.hpp
@@ -92,7 +92,7 @@ public:
 
     void SetAddress(Address address)
     {
-        assert(allocatedAddress == -1 && address >= 0);
+        assert(address >= 0);
        allocatedAddress = address;
     }
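The sketch below is a minimal, self-contained illustration of what the per-op IO allocation introduced above amounts to; it is not the regor API, and Tensor, Region and AllocateLinear are hypothetical stand-ins for SchedulerTensor, the architecture's IO memory areas and TensorAllocator::LinearAlloc. It shows the effect of calling AllocateIOAddresses once per CustomNpuOp: input and output feature maps are placed in their own regions with linear addresses that restart at zero for every op, instead of being allocated once globally as the removed block in AllocateAddresses did.

```cpp
// Minimal sketch -- NOT the regor API. Tensor, Region and AllocateLinear are
// simplified stand-ins used only to illustrate the idea behind
// Scheduler::AllocateIOAddresses under separate IO regions: each NPU op's IO
// feature maps get linear addresses starting at zero, i.e. an address space
// local to that op's command stream.
#include <cstdint>
#include <initializer_list>
#include <iostream>
#include <string>
#include <vector>

enum class Region { InputFeatureMap, OutputFeatureMap };

struct Tensor
{
    std::string name;
    Region region;
    int64_t size = 0;      // bytes
    int64_t address = -1;  // filled in by AllocateLinear
};

// Linear allocation over the tensors of one NPU op, restricted to one region.
// Addresses restart at zero on every call, which is what makes the address
// space local per op.
int64_t AllocateLinear(std::vector<Tensor *> &tensors, Region region, int64_t alignment)
{
    int64_t top = 0;
    for ( Tensor *t : tensors )
    {
        if ( t->region != region ) continue;
        t->address = top;
        top += (t->size + alignment - 1) & ~(alignment - 1);
    }
    return top;  // high-water mark: region size needed by this op
}

int main()
{
    // Two NPU ops, each with its own IO tensors.
    Tensor ifm0{"op0.ifm", Region::InputFeatureMap, 1000};
    Tensor ofm0{"op0.ofm", Region::OutputFeatureMap, 512};
    Tensor ifm1{"op1.ifm", Region::InputFeatureMap, 2048};
    Tensor ofm1{"op1.ofm", Region::OutputFeatureMap, 256};

    std::vector<std::vector<Tensor *>> npuOps = {{&ifm0, &ofm0}, {&ifm1, &ofm1}};

    // One pair of calls per NPU op, mirroring the two AllocateTensors calls in
    // Scheduler::AllocateIOAddresses invoked from Compiler::CompileGraph for
    // every (graphOp, npuOp) pair.
    for ( auto &ops : npuOps )
    {
        AllocateLinear(ops, Region::InputFeatureMap, 16);
        AllocateLinear(ops, Region::OutputFeatureMap, 16);
    }

    // Every IO tensor ends up at address 0 of its own region because the
    // address space is local to the op, not shared across the whole schedule.
    for ( const Tensor *t : {&ifm0, &ofm0, &ifm1, &ofm1} )
    {
        std::cout << t->name << " @ " << t->address << "\n";
    }
    return 0;
}
```

This per-op restart is also why SetAddress in scheduler_operation.hpp can no longer assert that allocatedAddress is still -1: the same IO tensor may legitimately be assigned an address more than once when it is shared between consecutive NPU ops.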