From a3a1df53e0c861ab2e3d85d14e6d118d58d211e0 Mon Sep 17 00:00:00 2001 From: Philip Hall Date: Wed, 5 Mar 2025 12:47:17 +0000 Subject: [PATCH] MLBEDSW-10106: Update Ethos-U55 MatMul performance stats - Updated performance calculations for the Ethos-U55 MatMul implementation. This is required to maintain the Ethos-U55/Ethos-U85 abstraction (both must return a result) when using the performance interface. - Fixed incomplete implementation of encoded weights byte transfer values. - Replaced manual datatype related scaling to use the DataType scaling functions. Signed-off-by: Philip Hall Change-Id: I7c8deb4e2740518874530786481d4ef57822bac4 --- ethosu/regor/architecture/architecture.hpp | 3 + .../ethosu55/ethos_u55_performance.cpp | 113 ++++++++++++++---- .../ethosu55/ethos_u55_performance.hpp | 5 +- .../ethos_u55_register_cs_generator.cpp | 2 +- .../ethosu85/ethos_u85_performance.cpp | 55 +++++---- .../ethosu85/ethos_u85_performance.hpp | 4 +- ethosu/regor/compiler/network_performance.cpp | 34 +++--- ethosu/regor/compiler/scheduler.cpp | 25 +++- ethosu/regor/compiler/scheduler.hpp | 2 +- ethosu/regor/compiler/scheduler_packing.cpp | 2 +- 10 files changed, 174 insertions(+), 71 deletions(-) diff --git a/ethosu/regor/architecture/architecture.hpp b/ethosu/regor/architecture/architecture.hpp index 993df631..5967770e 100644 --- a/ethosu/regor/architecture/architecture.hpp +++ b/ethosu/regor/architecture/architecture.hpp @@ -243,6 +243,9 @@ struct PerformanceQuery Shape constShape; ArchitectureMemory *constMemory; WeightFormat weightFormat; + ArchitectureMemory *tmpMemory; + unsigned encodedWeightSize; + unsigned encodedScaleSize; }; struct WeightStats diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_performance.cpp b/ethosu/regor/architecture/ethosu55/ethos_u55_performance.cpp index 82aa1ddd..26c83483 100644 --- a/ethosu/regor/architecture/ethosu55/ethos_u55_performance.cpp +++ b/ethosu/regor/architecture/ethosu55/ethos_u55_performance.cpp @@ -61,34 +61,19 @@ 
CycleCost EthosU55Performance::MeasureCycleCost(const PerformanceQuery &query, c auto npuOp = _arch->GetHWOp(query.type); const bool recordToDb = _db && _nextId != -1; - // Convolution/Vector product cycle calculation if ( OpUsesMacs(npuOp) ) { - if ( (npuOp == EthosU55NpuOp::Depthwise) || (npuOp == EthosU55NpuOp::Pooling) ) - { - cycles.macs = int64_t(query.kernel->ElementsWH()) * query.ofmShape.Elements() * 1; - } - else - { - cycles.macs = int64_t(query.kernel->ElementsWH()) * query.ofmShape.Elements() * query.ifmShape[0].Depth(); - } - cycleComponents = EstimateConvCycles(query, fused); + cycles.macs = cycleComponents.macs; cycles.opCycles = cycleComponents.cycles; } // Elementwise cycle calculation else if ( npuOp == EthosU55NpuOp::Elementwise ) { - auto [totCCPerElem, aoCCPerElem, cmdCCPerElem] = EstimateOutputCyclesPerElement(query, fused); - auto ofmShape = - (query.ofmFormat == TensorFormat::NHCWB16) ? Shape::RoundAway(query.ofmShape, Shape(1, 1, 1, 16)) : query.ofmShape; - cycles.opCycles = int64_t(totCCPerElem * float(ofmShape.Elements())); - if ( recordToDb ) - { - cycleComponents.aoCycles = int64_t(aoCCPerElem * float(ofmShape.Elements())); - cycleComponents.cmdCycles = int64_t(cmdCCPerElem * float(ofmShape.Elements())); - } + cycleComponents = EstimateElementwiseCycles(query, fused); + cycles.macs = cycleComponents.macs; + cycles.opCycles = cycleComponents.cycles; } else if ( npuOp == EthosU55NpuOp::Dma ) { @@ -97,9 +82,25 @@ CycleCost EthosU55Performance::MeasureCycleCost(const PerformanceQuery &query, c } else if ( npuOp == EthosU55NpuOp::Compound ) { - // TODO: Measure variable-implementation ops assert(query.type == OpType::Transpose || query.type == OpType::MatMul); - cycles.opCycles = EstimateMinimumMemoryCycles(query); + if ( query.type == OpType::MatMul ) + { + cycleComponents = EstimateMatMulCycles(query, fused); + cycles.macs = cycleComponents.macs; + cycles.opCycles = cycleComponents.cycles; + } + else + { + // TODO: Measure 
variable-implementation ops + // (default estimation based on memory access) + ElementAccess estimate = MeasureElementAccess(query); + estimate = ElementTransferToBytes(query, estimate); + assert(query.ifmMemory[0] && query.ofmMemory); + int64_t fromCycles = + int64_t(float(estimate.ifmRead[0]) / query.ifmMemory[0]->Bandwidth()) + query.ifmMemory[0]->ReadLatency(); + int64_t toCycles = int64_t(float(estimate.ofmWrite) / query.ofmMemory->Bandwidth()) + query.ofmMemory->WriteLatency(); + cycles.opCycles = std::max(fromCycles, toCycles); + } } else { @@ -131,7 +132,6 @@ CycleCost EthosU55Performance::MeasureCycleCost(const PerformanceQuery &query, c _nextId = -1; } - return cycles; } @@ -308,7 +308,63 @@ EthosU55Cycles EthosU55Performance::EstimateConvCycles(const PerformanceQuery &q cmdCycles = cmdCycles * numOfmBlks + cyclesDpuBlk; } - return {totalCycles, cyclesDpu, cyclesAO, cmdCycles}; + int64_t totalMacs = int64_t(query.kernel->ElementsWH()) * query.ofmShape.Elements(); + if ( (npuOp != EthosU55NpuOp::Depthwise) && (npuOp != EthosU55NpuOp::Pooling) ) + { + totalMacs *= query.ifmShape[0].Depth(); + } + + return {totalCycles, cyclesDpu, cyclesAO, cmdCycles, totalMacs}; +} + +EthosU55Cycles EthosU55Performance::EstimateElementwiseCycles(const PerformanceQuery &query, const std::vector &fused) +{ + auto [totCCPerElem, aoCCPerElem, cmdCCPerElem] = EstimateOutputCyclesPerElement(query, fused); + auto ofmShape = + (query.ofmFormat == TensorFormat::NHCWB16) ? 
Shape::RoundAway(query.ofmShape, Shape(1, 1, 1, 16)) : query.ofmShape; + float elements = float(ofmShape.Elements64()); + EthosU55Cycles cycleComponents{}; + cycleComponents.cycles = int64_t(totCCPerElem * elements); + cycleComponents.aoCycles = int64_t(aoCCPerElem * elements); + cycleComponents.cmdCycles = int64_t(cmdCCPerElem * elements); + return cycleComponents; +} + +EthosU55Cycles EthosU55Performance::EstimateMatMulCycles(const PerformanceQuery &query, const std::vector &fused) +{ + // Query the cost of individual parts of the matmul implementation + EthosU55OpConfig *config = static_cast(query.config); + PerformanceQuery subQuery = query; + + // Mul cost + subQuery.type = OpType::Mul; + subQuery.config = config->PrevConfig(); + subQuery.ofmShape = query.ifmShape[0]; + subQuery.ifmShape[1] = query.ifmShape[0]; + subQuery.ofmType = DataType::Int32; + subQuery.ofmMemory = query.tmpMemory; + EthosU55Cycles mulCost = EstimateElementwiseCycles(subQuery, fused); + + // ReduceSum cost + subQuery.type = OpType::ReduceSum; + subQuery.config = config; + subQuery.ifmShape[1] = Shape(); + subQuery.ifmMemory[0] = query.tmpMemory; + subQuery.ofmShape = subQuery.ofmShape.WithDepth(1); + subQuery.ofmType = query.ofmType; + subQuery.ofmMemory = query.ofmMemory; + EthosU55Cycles sumCost = EstimateConvCycles(subQuery, fused); + + // Repeat for every column of the ofm + int cols = query.ifmShape[1].Width(); + EthosU55Cycles cycles{}; + cycles.macs = (mulCost.macs + sumCost.macs) * cols; + cycles.cycles = (mulCost.cycles + sumCost.cycles) * cols; + cycles.aoCycles = (mulCost.aoCycles + sumCost.aoCycles) * cols; + cycles.cmdCycles = (mulCost.cmdCycles + sumCost.cmdCycles) * cols; + cycles.macCycles = (mulCost.macCycles + sumCost.macCycles) * cols; + + return cycles; } static int EstimateMemoryTransfer(int cores, bool isRead, ArchitectureMemory *memory, TensorFormat format, @@ -603,9 +659,14 @@ ElementAccess EthosU55Performance::ElementTransferToBytes(const PerformanceQuery 
result.ofmWrite = EstimateMemoryTransfer(_arch->_cores, false, query.ofmMemory, query.ofmFormat, DataTypeSizeBits(query.ofmType), opConfig->OfmBlock(), query.ofmShape, access.ofmWrite); - // These requires compression ratio information - result.constRead[0] = 0; - result.constRead[1] = 0; + // Use encoded information from query to estimate weight reads if present + result.constRead[0] = result.constRead[1] = 0; + if ( query.encodedWeightSize ) + { + result.constRead[0] = access.weightsRefetch * query.encodedWeightSize; + result.constRead[1] = access.weightsRefetch * query.encodedScaleSize; + result.weightsRefetch = 1; + } return result; } diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_performance.hpp b/ethosu/regor/architecture/ethosu55/ethos_u55_performance.hpp index c3b8d2a5..6d50c1d0 100644 --- a/ethosu/regor/architecture/ethosu55/ethos_u55_performance.hpp +++ b/ethosu/regor/architecture/ethosu55/ethos_u55_performance.hpp @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2021-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -39,6 +39,7 @@ struct EthosU55Cycles int64_t macCycles; int64_t aoCycles; int64_t cmdCycles; + int64_t macs; }; struct EthosU55ElementCycles @@ -78,6 +79,8 @@ public: private: EthosU55Cycles EstimateConvCycles(const PerformanceQuery &query, const std::vector &fused); + EthosU55Cycles EstimateElementwiseCycles(const PerformanceQuery &query, const std::vector &fused); + EthosU55Cycles EstimateMatMulCycles(const PerformanceQuery &query, const std::vector &fused); EthosU55ElementCycles EstimateOutputCyclesPerElement(const PerformanceQuery &query, const std::vector &fused); int64_t EstimateMinimumMemoryCycles(const PerformanceQuery &query); }; diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.cpp b/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.cpp index 
736b129e..d1942579 100644 --- a/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.cpp +++ b/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.cpp @@ -1465,7 +1465,7 @@ void EthosU55RCSGenerator::InsertTransposeCommand(const HLCStripe *stripe, Tempo query.ofmShape = outFM.shape; query.ofmFormat = TensorFormat::NHWC; query.transpose = ofm.transpose; - temps.configs.push_back(_arch->FindBlockConfig(cmd->operation->type, query)); + temps.configs.push_back(_arch->GetOpConfig(cmd->operation->type, query)); } cmd->operation->config = temps.configs.back().get(); // Add to CMD list diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85_performance.cpp b/ethosu/regor/architecture/ethosu85/ethos_u85_performance.cpp index 302f70b5..2519623d 100644 --- a/ethosu/regor/architecture/ethosu85/ethos_u85_performance.cpp +++ b/ethosu/regor/architecture/ethosu85/ethos_u85_performance.cpp @@ -72,32 +72,17 @@ CycleCost EthosU85Performance::MeasureCycleCost(const PerformanceQuery &query, c // Convolution/Vector product cycle calculation if ( OpUsesMacs(npuOp) ) { - if ( npuOp == EthosU85NpuOp::Depthwise || npuOp == EthosU85NpuOp::Pooling || - npuOp == EthosU85NpuOp::ReduceMinMax || npuOp == EthosU85NpuOp::ArgMax ) - { - cycles.macs = int64_t(query.kernel->ElementsWH()) * query.ofmShape.Elements(); - } - else - { - cycles.macs = int64_t(query.kernel->ElementsWH()) * query.ofmShape.Elements() * query.ifmShape[0].Depth(); - } - cycles.macs /= sparse ? 2 : 1; - cycleComponents = EstimateConvCycles(query, fused); + cycles.macs = cycleComponents.macs; + cycles.macs /= sparse ? 2 : 1; cycles.opCycles = cycleComponents.cycles; } // Elementwise cycle calculation else if ( npuOp == EthosU85NpuOp::Elementwise ) { - auto [totCCPerElem, aoCCPerElem, cmdCCPerElem] = EstimateOutputCyclesPerElement(query, fused); - auto ofmShape = - (query.ofmFormat == TensorFormat::NHCWB16) ?
Shape::RoundAway(query.ofmShape, Shape(1, 1, 1, 16)) : query.ofmShape; - cycles.opCycles = int64_t(totCCPerElem * float(ofmShape.Elements())); - if ( recordToDb ) - { - cycleComponents.aoCycles = int64_t(aoCCPerElem * float(ofmShape.Elements())); - cycleComponents.cmdCycles = int64_t(cmdCCPerElem * float(ofmShape.Elements())); - } + cycleComponents = EstimateElementwiseCycles(query, fused); + cycles.macs = cycleComponents.macs; + cycles.opCycles = cycleComponents.cycles; } // Resize cycle calculation else if ( npuOp == EthosU85NpuOp::Resize ) @@ -298,7 +282,25 @@ EthosU85Cycles EthosU85Performance::EstimateConvCycles(const PerformanceQuery &q cmdCycles = cmdCycles * numOfmBlks + cyclesDpuBlk; } - return {totalCycles, cyclesDpu, cyclesAO, cmdCycles}; + int64_t totalMacs = int64_t(query.kernel->ElementsWH()) * query.ofmShape.Elements(); + if ( npuOp != EthosU85NpuOp::Depthwise && npuOp != EthosU85NpuOp::Pooling && npuOp != EthosU85NpuOp::ReduceMinMax && npuOp != EthosU85NpuOp::ArgMax ) + { + totalMacs *= query.ifmShape[0].Depth(); + } + return {totalCycles, cyclesDpu, cyclesAO, cmdCycles, totalMacs}; +} + +EthosU85Cycles EthosU85Performance::EstimateElementwiseCycles(const PerformanceQuery &query, const std::vector &fused) +{ + auto [totCCPerElem, aoCCPerElem, cmdCCPerElem] = EstimateOutputCyclesPerElement(query, fused); + auto ofmShape = + (query.ofmFormat == TensorFormat::NHCWB16) ?
Shape::RoundAway(query.ofmShape, Shape(1, 1, 1, 16)) : query.ofmShape; + float elements = float(ofmShape.Elements64()); + EthosU85Cycles cycleComponents{}; + cycleComponents.cycles = int64_t(totCCPerElem * elements); + cycleComponents.aoCycles = int64_t(aoCCPerElem * elements); + cycleComponents.cmdCycles = int64_t(cmdCCPerElem * elements); + return cycleComponents; } static int64_t EstimateMemoryTransfer(int cores, bool isRead, ArchitectureMemory *memory, TensorFormat format, @@ -585,9 +587,14 @@ ElementAccess EthosU85Performance::ElementTransferToBytes(const PerformanceQuery result.ofmWrite = EstimateMemoryTransfer(_arch->_cores, false, query.ofmMemory, query.ofmFormat, DataTypeSizeBits(query.ofmType), ofmBlock, query.ofmShape, access.ofmWrite); - // These requires compression ratio information - result.constRead[0] = 0; - result.constRead[1] = 0; + // Use encoded information from query to estimate weight reads if present + result.constRead[0] = result.constRead[1] = 0; + if ( query.encodedWeightSize ) + { + result.constRead[0] = access.weightsRefetch * query.encodedWeightSize; + result.constRead[1] = access.weightsRefetch * query.encodedScaleSize; + result.weightsRefetch = 1; + } return result; } diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85_performance.hpp b/ethosu/regor/architecture/ethosu85/ethos_u85_performance.hpp index a048e99e..72cdd42f 100644 --- a/ethosu/regor/architecture/ethosu85/ethos_u85_performance.hpp +++ b/ethosu/regor/architecture/ethosu85/ethos_u85_performance.hpp @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2021-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -39,6 +39,7 @@ struct EthosU85Cycles int64_t macCycles = 0; int64_t aoCycles = 0; int64_t cmdCycles = 0; + int64_t macs = 0; }; struct EthosU85ElementCycles @@ -79,6 +80,7 @@ public: private: EthosU85Cycles EstimateConvCycles(const 
PerformanceQuery &query, const std::vector &fused); EthosU85ElementCycles EstimateOutputCyclesPerElement(const PerformanceQuery &query, const std::vector &fused); + EthosU85Cycles EstimateElementwiseCycles(const PerformanceQuery &query, const std::vector &fused); int64_t EstimateMinimumMemoryCycles(const PerformanceQuery &query); }; diff --git a/ethosu/regor/compiler/network_performance.cpp b/ethosu/regor/compiler/network_performance.cpp index dc88940d..b0995346 100644 --- a/ethosu/regor/compiler/network_performance.cpp +++ b/ethosu/regor/compiler/network_performance.cpp @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2021-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -402,7 +402,7 @@ PerformanceResult NetworkPerformance::EstimateFullOpPerformance( { UNUSED(prevOp); auto wgtFormat = cost->npuWeightsTensor ? cost->npuWeightsTensor->config->Format() : Flags(WeightFormat::Default); - PerformanceQuery query = Scheduler::InitPerfQuery(schedOp, cost->Config(), -1, wgtFormat); + PerformanceQuery query = Scheduler::InitPerfQuery(schedOp, cost->Config(), -1, wgtFormat, cost); std::vector fused = Scheduler::InitFusionQuery(schedOp); // Memory that NPU will source weights from for operations @@ -503,35 +503,41 @@ PerformanceResult NetworkPerformance::EstimateFullOpPerformance( auto ofm = schedOp->OFM(); result.memory[ofm->tensor->memArea.memory].access[AccessType::FeatureMap].bytesWritten += byteAccess.ofmWrite; result.memory[ofm->tensor->memArea.memory] - .writeTransferOverhead += byteAccess.ofmWrite - DataTypeSizeBits(ofm->tensor->dataType) / 8 * access.ofmWrite; + .writeTransferOverhead += byteAccess.ofmWrite - DataTypeStorageSizeBytes(ofm->tensor->dataType, access.ofmWrite); // IFM1 read auto ifm = schedOp->IFM(0); result.memory[ifm->tensor->memArea.memory].access[AccessType::FeatureMap].bytesRead += byteAccess.ifmRead[0]; 
result.memory[ifm->tensor->memArea.memory] - .readTransferOverhead += byteAccess.ifmRead[0] - DataTypeSizeBits(ifm->tensor->dataType) / 8 * access.ifmRead[0]; + .readTransferOverhead += byteAccess.ifmRead[0] - DataTypeStorageSizeBytes(ifm->tensor->dataType, access.ifmRead[0]); // IFM2 read auto ifm2 = schedOp->TryIFM(1); if ( ifm2 ) { result.memory[ifm2->tensor->memArea.memory].access[AccessType::FeatureMap].bytesRead += byteAccess.ifmRead[1]; - result.memory[ifm2->tensor->memArea.memory] - .readTransferOverhead += byteAccess.ifmRead[1] - DataTypeSizeBits(ifm2->tensor->dataType) / 8 * access.ifmRead[1]; + result.memory[ifm2->tensor->memArea.memory].readTransferOverhead += + byteAccess.ifmRead[1] - DataTypeStorageSizeBytes(ifm2->tensor->dataType, access.ifmRead[1]); } - // Weight read - if ( cost->npuWeightsTensor && access.constRead[0] > 0 ) + // Reads/writes to temporary or intermediate memories + auto scratch = schedOp->TryInput(TensorUsage::Scratch); + if ( scratch ) { - int encodedWeightsSize = cost->npuWeightsTensor->totalWeightBytes; - result.memory[weightsMemory].access[AccessType::Weights].bytesRead += int64_t(encodedWeightsSize) * access.weightsRefetch; + result.memory[scratch->tensor->memArea.memory].access[AccessType::FeatureMap].bytesRead += byteAccess.tmpRead; + result.memory[scratch->tensor->memArea.memory] + .readTransferOverhead += byteAccess.tmpRead - DataTypeStorageSizeBytes(scratch->tensor->dataType, access.tmpRead); + + result.memory[scratch->tensor->memArea.memory].access[AccessType::FeatureMap].bytesWritten += byteAccess.tmpWrite; + result.memory[scratch->tensor->memArea.memory].writeTransferOverhead += + byteAccess.tmpWrite - DataTypeStorageSizeBytes(scratch->tensor->dataType, access.tmpWrite); } - // Scale read - if ( cost->npuWeightsTensor && access.constRead[1] > 0 ) + // Weight/scale reads + if ( cost->npuWeightsTensor ) { - int encodedScaleSize = cost->npuWeightsTensor->AllocationSizeBytes() - cost->npuWeightsTensor->totalWeightBytes; -
result.memory[weightsMemory].access[AccessType::Scales].bytesRead += int64_t(encodedScaleSize) * access.weightsRefetch; + result.memory[weightsMemory].access[AccessType::Weights].bytesRead += byteAccess.constRead[0]; + result.memory[weightsMemory].access[AccessType::Scales].bytesRead += byteAccess.constRead[1]; } // Update memory-access cycles and find the maximum memory read cycle time diff --git a/ethosu/regor/compiler/scheduler.cpp b/ethosu/regor/compiler/scheduler.cpp index 06d007a9..e2cd127a 100644 --- a/ethosu/regor/compiler/scheduler.cpp +++ b/ethosu/regor/compiler/scheduler.cpp @@ -1644,7 +1644,8 @@ void Scheduler::CoalesceWeightBufferTensors(Schedule *schedule) } -PerformanceQuery Scheduler::InitPerfQuery(SchedulerOperation *op, ArchitectureOpConfig *config, int ofmDepth, WeightFormat wgtFormat) +PerformanceQuery Scheduler::InitPerfQuery( + SchedulerOperation *op, ArchitectureOpConfig *config, int ofmDepth, WeightFormat wgtFormat, SchedulerOpInfo *cost) { PerformanceQuery query = {}; query.type = op->Type(); @@ -1667,11 +1668,18 @@ PerformanceQuery Scheduler::InitPerfQuery(SchedulerOperation *op, ArchitectureOp } SchedulerConnection *ofm = op->OFM(); - query.ofmShape = (ofmDepth >= 0) ? ofm->SliceShape().WithDepth(ofmDepth) : ofm->SliceShape(); + ofmDepth = (ofmDepth >= 0) ? 
ofmDepth : ofm->SliceShape().Depth(); + query.ofmShape = ofm->SliceShape().WithDepth(ofmDepth); query.ofmMemory = ofm->tensor->memArea.memory; query.ofmType = ofm->tensor->dataType; query.ofmFormat = ofm->tensor->format; + SchedulerConnection *scratch = op->TryInput(TensorUsage::Scratch); + if ( scratch ) + { + query.tmpMemory = scratch->tensor->memArea.memory; + } + SchedulerConnection *scales = op->TryInput(TensorUsage::Scales); if ( scales ) { @@ -1679,6 +1687,19 @@ PerformanceQuery Scheduler::InitPerfQuery(SchedulerOperation *op, ArchitectureOp query.constMemory = scales->tensor->memArea.memory; } + // If post-schedule cost is available, update with encoded sizes + if ( cost && cost->npuWeightsTensor ) + { + float ratio = float(ofmDepth) / ofm->SliceShape().Depth(); + unsigned weightBytes = cost->npuWeightsTensor->totalWeightBytes; + unsigned scaleBytes = cost->npuWeightsTensor->AllocationSizeBytes() - weightBytes; + + // Encoded weight and scale sizes, estimated as a proportion if sliced. 
+ query.encodedWeightSize = unsigned(weightBytes * ratio); + query.encodedScaleSize = unsigned(scaleBytes * ratio); + query.constMemory = cost->npuWeightsTensor->memArea.memory; + } + query.weightFormat = wgtFormat; return query; diff --git a/ethosu/regor/compiler/scheduler.hpp b/ethosu/regor/compiler/scheduler.hpp index d9313f3f..c137540b 100644 --- a/ethosu/regor/compiler/scheduler.hpp +++ b/ethosu/regor/compiler/scheduler.hpp @@ -303,7 +303,7 @@ public: void AllocateReadOnlyAddresses(Schedule *schedule, IncrementalLinearAllocator &readOnlyAllocator); static PerformanceQuery InitPerfQuery(SchedulerOperation *op, ArchitectureOpConfig *config, int ofm_depth = -1, - WeightFormat wgtFormat = WeightFormat::Default); + WeightFormat wgtFormat = WeightFormat::Default, SchedulerOpInfo *cost = nullptr); static std::vector InitFusionQuery(SchedulerOperation *op); private: diff --git a/ethosu/regor/compiler/scheduler_packing.cpp b/ethosu/regor/compiler/scheduler_packing.cpp index 215caec3..0007053b 100644 --- a/ethosu/regor/compiler/scheduler_packing.cpp +++ b/ethosu/regor/compiler/scheduler_packing.cpp @@ -558,7 +558,7 @@ std::unique_ptr SchedulerPacking::MakeSchedulerOperation(Ope auto scratchTensor = std::make_shared(req.scratch.type, req.scratch.size, req.scratch.format); SchedulerConnection *scratchConn = schedOp->AddInput(TensorUsage::Scratch0, scratchTensor); scratchConn->shape = req.scratch.size; - scratchTensor->memArea = _arch->StagingMemory(); + scratchTensor->memArea = _arch->FeatureMapMemory(); } } -- GitLab