From 59a7c0016af0e8ff7e3c2c770aef0a983739b5d2 Mon Sep 17 00:00:00 2001 From: William Isaksson Date: Thu, 13 Mar 2025 11:20:49 +0100 Subject: [PATCH] MLBEDSW-9392: Improve Access Cycle Estimation for Ethos-U85 Improves the algorithm for estimating access cycles drastically. Change-Id: I468fb71e373ed6f779eac6370d725466809481f8 Signed-off-by: William Isaksson --- ethosu/regor/architecture/architecture.hpp | 19 +- .../ethosu55/ethos_u55_performance.cpp | 86 ++++++++- .../ethosu55/ethos_u55_performance.hpp | 6 +- .../regor/architecture/ethosu85/ethos_u85.cpp | 11 +- .../regor/architecture/ethosu85/ethos_u85.hpp | 4 +- .../ethosu85/ethos_u85_performance.cpp | 174 +++++++++++++++++- .../ethosu85/ethos_u85_performance.hpp | 19 +- ethosu/regor/compiler/network_performance.cpp | 28 +-- ethosu/regor/compiler/network_performance.hpp | 14 +- ethosu/regor/compiler/scheduler.cpp | 21 ++- 10 files changed, 325 insertions(+), 57 deletions(-) diff --git a/ethosu/regor/architecture/architecture.hpp b/ethosu/regor/architecture/architecture.hpp index 0b28ac01..b5d354aa 100644 --- a/ethosu/regor/architecture/architecture.hpp +++ b/ethosu/regor/architecture/architecture.hpp @@ -244,6 +244,8 @@ struct PerformanceQuery ArchitectureMemory *tmpMemory; unsigned encodedWeightSize; unsigned encodedScaleSize; + ArchitectureMemory *weightStagingMemory; + unsigned firstWeightDMASize; }; struct WeightStats @@ -288,14 +290,12 @@ struct ElementAccess int tmpRead = 0, tmpWrite = 0; }; -enum class MemChannel +struct AccessCycles { - Mem2Mem = 0, - IFMStream = 1, - Weight = 2, - FastWeight = 3, - IFM = 4, - OFM = 5, + int64_t fmAccessCycles = 0; + int64_t weightsAccessCycles = 0; + int64_t scalesAccessCycles = 0; + int64_t totalAccessCycles = 0; }; /// @@ -311,9 +311,12 @@ public: virtual ElementAccess ElementTransferToBytes(const PerformanceQuery &query, const ElementAccess &access) = 0; virtual int64_t WeightDecodeCycles(const PerformanceQuery &query, const WeightStats &weights, Flags format, ArchitectureMemory *weightsMemory) = 0; - virtual float ChannelBW(const ArchitectureMemory *mem, MemChannel channel) = 0; virtual void InitDatabase(Database *db) = 0; virtual void RecordToDB(int opId) = 0; + virtual int64_t MinReadCycles(ArchitectureMemory *mem, int size, TensorUsage usage, OpType type, bool fastWeights) = 0; + virtual int64_t MinWriteCycles(ArchitectureMemory *mem, int size) = 0; + virtual std::unordered_map + MeasureAccessCycles(const PerformanceQuery &query, const ElementAccess &byteAccess) = 0; }; enum class IniParseResult diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_performance.cpp b/ethosu/regor/architecture/ethosu55/ethos_u55_performance.cpp index 9605508a..bd6afa5f 100644 --- a/ethosu/regor/architecture/ethosu55/ethos_u55_performance.cpp +++ b/ethosu/regor/architecture/ethosu55/ethos_u55_performance.cpp @@ -680,12 +680,6 @@ int64_t EthosU55Performance::WeightDecodeCycles( return dmaCycles; } -float EthosU55Performance::ChannelBW(const ArchitectureMemory *mem, const MemChannel channel) -{ - UNUSED(channel); - return mem->Bandwidth(); -} - void EthosU55Performance::InitDatabase(Database *optDB) { _db = optDB; @@ -720,4 +714,84 @@ void EthosU55Performance::RecordToDB(int opId) } } +int64_t EthosU55Performance::MinReadCycles(ArchitectureMemory *mem, int size, TensorUsage usage, OpType type, bool fastWeights) +{ + auto transferCycles = size / double(mem->Bandwidth()); + // Add on latency since this function returns the cycle count for the transfer itself which is not necessarily the + // same as the cycle 
count that the operation attributes to this transfer. + return transferCycles + mem->ReadLatency(); +} + +int64_t EthosU55Performance::MinWriteCycles(ArchitectureMemory *mem, int size) +{ + auto transferCycles = size / double(mem->Bandwidth()); + return transferCycles + mem->WriteLatency(); +} + +enum class TransferGroup +{ + FeatureMaps, + Weights, + Scales, +}; + +std::unordered_map +EthosU55Performance::MeasureAccessCycles(const PerformanceQuery &query, const ElementAccess &byteAccess) +{ + std::unordered_map memoryAccessCycles; + std::unordered_map> transferBytes; + // IFM + transferBytes[query.ifmMemory[0]][TransferGroup::FeatureMaps] += byteAccess.ifmRead[0]; + // IFM2 + if ( !query.ifmShape[1].IsEmpty() ) + { + transferBytes[query.ifmMemory[1]][TransferGroup::FeatureMaps] += byteAccess.ifmRead[1]; + } + // OFM + transferBytes[query.ofmMemory][TransferGroup::FeatureMaps] += byteAccess.ofmWrite; + + if ( query.constMemory ) + { + // Weights + if ( query.weightStagingMemory ) + { + // Concurrent DMA Weights + auto nonPreBufferedWeightsSize = std::max(int64_t(query.encodedWeightSize) - int64_t(query.firstWeightDMASize), int64_t(0)); + transferBytes[query.constMemory][TransferGroup::Weights] += nonPreBufferedWeightsSize; + transferBytes[query.weightStagingMemory][TransferGroup::Weights] += nonPreBufferedWeightsSize; + transferBytes[query.weightStagingMemory][TransferGroup::Weights] += byteAccess.constRead[0]; + } + else + { + transferBytes[query.constMemory][TransferGroup::Weights] += byteAccess.constRead[0]; + } + // Scales + transferBytes[query.constMemory][TransferGroup::Scales] += byteAccess.constRead[1]; + } + // DMA + if ( query.tmpMemory ) + { + transferBytes[query.tmpMemory][TransferGroup::FeatureMaps] += byteAccess.tmpRead; + transferBytes[query.tmpMemory][TransferGroup::FeatureMaps] += byteAccess.tmpWrite; + } + + for ( auto &[mem, groups] : transferBytes ) + { + AccessCycles accessCycles; + int64_t totalBytes = 0; + for ( auto &[group, bytes] : groups ) + { + totalBytes += bytes; + } + + accessCycles.fmAccessCycles = groups.count(TransferGroup::FeatureMaps) ? groups[TransferGroup::FeatureMaps] / mem->Bandwidth() : 0; + accessCycles.weightsAccessCycles = groups.count(TransferGroup::Weights) ? groups[TransferGroup::Weights] / mem->Bandwidth() : 0; + accessCycles.scalesAccessCycles = groups.count(TransferGroup::Scales) ? 
groups[TransferGroup::Scales] / mem->Bandwidth() : 0; + accessCycles.totalAccessCycles = totalBytes / mem->Bandwidth(); + memoryAccessCycles[mem] = accessCycles; + } + + return memoryAccessCycles; +} + } // namespace regor diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_performance.hpp b/ethosu/regor/architecture/ethosu55/ethos_u55_performance.hpp index 9eba2bbc..1f7b6e67 100644 --- a/ethosu/regor/architecture/ethosu55/ethos_u55_performance.hpp +++ b/ethosu/regor/architecture/ethosu55/ethos_u55_performance.hpp @@ -72,9 +72,13 @@ public: ElementAccess ElementTransferToBytes(const PerformanceQuery &query, const ElementAccess &access) override; int64_t WeightDecodeCycles(const PerformanceQuery &query, const WeightStats &weights, Flags format, ArchitectureMemory *weightsMemory) override; - float ChannelBW(const ArchitectureMemory *mem, MemChannel channel) override; void InitDatabase(Database *optDB) override; void RecordToDB(int opId) override; + int64_t MinReadCycles(ArchitectureMemory *mem, int size, TensorUsage usage, OpType type, bool fastWeights) override; + int64_t MinWriteCycles(ArchitectureMemory *mem, int size) override; + std::unordered_map + MeasureAccessCycles(const PerformanceQuery &query, const ElementAccess &byteAccess) override; + private: EthosU55Cycles EstimateConvCycles(const PerformanceQuery &query, const std::vector &fused); diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85.cpp b/ethosu/regor/architecture/ethosu85/ethos_u85.cpp index 70e88b1f..a11d2b1e 100644 --- a/ethosu/regor/architecture/ethosu85/ethos_u85.cpp +++ b/ethosu/regor/architecture/ethosu85/ethos_u85.cpp @@ -67,18 +67,19 @@ static const EthosU85PerfInfo s_EthosU85PerfInfo[] = { static const ArchEthosU85::AcceleratorConfig s_EthosU85Configs[] = { // Accelerator.Ethos_U85_128 - {128, 1, {Shape(1, 2, 8), Shape(1, 1, 16)}, Shape(1, 2, 8), 2, 8192, 8192, 2048, 768, 1, 0, {64, 64, 128, 128, 104}, &s_EthosU85PerfInfo[0]}, + {128, 1, {Shape(1, 2, 8), Shape(1, 1, 16)}, Shape(1, 2, 8), 2, 8192, 8192, 2048, 768, 1, 0, {64, 64, 128, 128, 104, 16}, &s_EthosU85PerfInfo[0]}, // Accelerator.Ethos_U85_256 {256, 1, {Shape(1, 2, 16), Shape(1, 4, 8), Shape(2, 2, 8)}, Shape(2, 2, 8), 3, 16384, 16384, 2048, 1536, 1, 0, - {104, 104, 128, 128, 128}, &s_EthosU85PerfInfo[1]}, + {104, 104, 128, 128, 128, 16}, &s_EthosU85PerfInfo[1]}, // Accelerator.Ethos_U85_512 - {512, 2, {Shape(2, 2, 16), Shape(1, 4, 16)}, Shape(2, 2, 16), 2, 16384, 32768, 4096, 3072, 1, 0, {128, 128, 256, 256, 128}, &s_EthosU85PerfInfo[2]}, + {512, 2, {Shape(2, 2, 16), Shape(1, 4, 16)}, Shape(2, 2, 16), 2, 16384, 32768, 4096, 3072, 1, 0, + {128, 128, 256, 256, 128, 16}, &s_EthosU85PerfInfo[2]}, // Accelerator.Ethos_U85_1024 {1024, 4, {Shape(2, 2, 32), Shape(1, 4, 32), Shape(2, 4, 16)}, Shape(4, 2, 16), 3, 16384, 65536, 4096, 6144, 1, 1, - {256, 256, 416, 208, 256}, &s_EthosU85PerfInfo[3]}, + {256, 256, 416, 208, 256, 16}, &s_EthosU85PerfInfo[3]}, // Accelerator.Ethos_U85_2048 {2048, 4, {Shape(2, 2, 64), Shape(1, 4, 64), Shape(4, 4, 16)}, Shape(4, 4, 16), 3, 32768, 131072, 8192, 12288, 2, 1, - {256, 256, 512, 256, 256}, &s_EthosU85PerfInfo[4]}, + {256, 256, 512, 256, 256, 16}, &s_EthosU85PerfInfo[4]}, }; constexpr int CB_SLOTS = 6; diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85.hpp b/ethosu/regor/architecture/ethosu85/ethos_u85.hpp index e6dbef09..5f734239 100644 --- a/ethosu/regor/architecture/ethosu85/ethos_u85.hpp +++ b/ethosu/regor/architecture/ethosu85/ethos_u85.hpp @@ -172,7 +172,7 @@ public: int cbRamSizeBytes; uint8_t 
numAxiSramLog2; uint8_t numAxiExtLog2; - const std::array channelRBs; + const std::array channelRBs; const EthosU85PerfInfo *perfInfo; }; @@ -196,7 +196,7 @@ private: int _accRamSizeBytes = 0; int _numAxiSramLog2 = 0; int _numAxiExtLog2 = 0; - const std::array *_channelRBs{}; + const std::array *_channelRBs{}; protected: std::unique_ptr _weightEncoder; diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85_performance.cpp b/ethosu/regor/architecture/ethosu85/ethos_u85_performance.cpp index c1007272..0c7c0909 100644 --- a/ethosu/regor/architecture/ethosu85/ethos_u85_performance.cpp +++ b/ethosu/regor/architecture/ethosu85/ethos_u85_performance.cpp @@ -129,8 +129,8 @@ int64_t EthosU85Performance::MemToMemCycles(const ArchitectureMemory *dest, cons int64_t fromCycles = int64_t(float(sizeBytes) / ChannelBW(source, MemChannel::Mem2Mem)); fromCycles += source->ReadLatency(); // TODO: Below shouldn't use the OFM channel. See MLBEDSW-9384. - int64_t toCycles = int64_t(float(sizeBytes) / ChannelBW(dest, MemChannel::OFM)); - toCycles += source->WriteLatency(); + int64_t toCycles = int64_t(float(sizeBytes) / ChannelBW(dest, MemChannel::Write)); + toCycles += dest->WriteLatency(); return std::max(fromCycles, toCycles); } @@ -629,8 +629,13 @@ float EthosU85Performance::ChannelBW(const ArchitectureMemory *mem, const MemCha float read_rb_lim; int maxOutstanding; int latency; - - if ( channel == MemChannel::OFM ) + if ( channel == MemChannel::None ) + { + latency = mem->ReadLatency(); + maxOutstanding = mem->MaxReads(); + read_rb_lim = std::numeric_limits::max(); + } + else if ( channel == MemChannel::Write ) { maxOutstanding = mem->MaxWrites(); latency = mem->WriteLatency(); @@ -640,7 +645,8 @@ float EthosU85Performance::ChannelBW(const ArchitectureMemory *mem, const MemCha { maxOutstanding = mem->MaxReads(); latency = mem->ReadLatency(); - int channelRB = _arch->_channelRBs->at(static_cast(channel)); + auto channelIdx = std::max(static_cast(channel) - 1, 0); + int channelRB = _arch->_channelRBs->at(channelIdx); read_rb_lim = static_cast(channelRB) / burstLenWords; } @@ -684,4 +690,162 @@ void EthosU85Performance::RecordToDB(int opId) } } +MemChannel EthosU85Performance::LookupChannel(OpType type, TensorUsage usage, bool fastWeights) +{ + if ( usage == TensorUsage::Weights ) + { + if ( fastWeights ) + { + return MemChannel::FastWeight; + } + else + { + return MemChannel::Weight; + } + } + else if ( usage == TensorUsage::Scales ) + { + return MemChannel::Scale; + } + else if ( IsIFM(usage) ) + { + if ( (usage == TensorUsage::IFM1 && type == OpType::MatMul) || type == OpType::Resize || IsElementwise(type) ) + { + return MemChannel::IFMStream; + } + else + { + return MemChannel::IFM; + } + } + else if ( IsOFM(usage) ) + { + return MemChannel::Write; + } + else if ( usage == TensorUsage::Scratch ) + { + return MemChannel::IFMStream; + } + else + { + return MemChannel::None; + } +} + +int64_t EthosU85Performance::MinReadCycles(ArchitectureMemory *mem, int size, TensorUsage usage, OpType type, bool fastWeights) +{ + auto channel = LookupChannel(type, usage, fastWeights); + auto transferCycles = size / double(ChannelBW(mem, channel)); + // Add on latency since this function returns the cycle count for the transfer itself which is not necessarily the + // same as the cycle count that the operation attributes to this transfer. 
+ return transferCycles + mem->ReadLatency(); +} + +int64_t EthosU85Performance::MinWriteCycles(ArchitectureMemory *mem, int size) +{ + auto channel = MemChannel::Write; + auto transferCycles = size / double(ChannelBW(mem, channel)); + // Add on latency since this function returns the cycle count for the transfer itself which is not necessarily the + // same as the cycle count that the operation attributes to this transfer. + return transferCycles + mem->WriteLatency(); +} + +std::unordered_map +EthosU85Performance::MeasureAccessCycles(const PerformanceQuery &query, const ElementAccess &byteAccess) +{ + enum class TransferGroup + { + FeatureMaps, + Weights, + Scales, + }; + std::unordered_map memoryAccessCycles; + std::unordered_map>> channelTransferBytes; + // IFM + auto channel = LookupChannel(query.type, TensorUsage::IFM, false); + channelTransferBytes[query.ifmMemory[0]][channel][TransferGroup::FeatureMaps] += byteAccess.ifmRead[0]; + // IFM2 + if ( !query.ifmShape[1].IsEmpty() ) + { + channel = LookupChannel(query.type, TensorUsage::IFM1, false); + channelTransferBytes[query.ifmMemory[1]][channel][TransferGroup::FeatureMaps] += byteAccess.ifmRead[1]; + } + // OFM + channelTransferBytes[query.ofmMemory][MemChannel::Write][TransferGroup::FeatureMaps] += byteAccess.ofmWrite; + + if ( query.constMemory ) + { + // Weights + channel = LookupChannel(query.type, TensorUsage::Weights, query.weightFormat & WeightFormat::Fast); + if ( query.weightStagingMemory ) + { + // Concurrent DMA Weights + auto nonPreBufferedWeightsSize = std::max(int64_t(query.encodedWeightSize) - int64_t(query.firstWeightDMASize), int64_t(0)); + channelTransferBytes[query.constMemory][MemChannel::Mem2Mem][TransferGroup::Weights] += nonPreBufferedWeightsSize; + channelTransferBytes[query.weightStagingMemory][MemChannel::Write][TransferGroup::Weights] += nonPreBufferedWeightsSize; + channelTransferBytes[query.weightStagingMemory][channel][TransferGroup::Weights] += byteAccess.constRead[0]; + } + else + { + channelTransferBytes[query.constMemory][MemChannel::Weight][TransferGroup::Weights] += byteAccess.constRead[0]; + } + // Scales + channel = LookupChannel(query.type, TensorUsage::Scales, false); + channelTransferBytes[query.constMemory][channel][TransferGroup::Scales] += byteAccess.constRead[1]; + } + // DMA + if ( query.tmpMemory ) + { + channel = LookupChannel(query.type, TensorUsage::Scratch, false); + channelTransferBytes[query.tmpMemory][channel][TransferGroup::FeatureMaps] += byteAccess.tmpRead; + channelTransferBytes[query.tmpMemory][MemChannel::Write][TransferGroup::FeatureMaps] += byteAccess.tmpWrite; + } + + // Total access cycles for any grouping: + // Group access cycles = max(group read + group write/mem bw, max group channel cycles) + // Where group channel cycles is the channel transfer cycles attributable to that group. 
+ for ( auto &[mem, channels] : channelTransferBytes ) + { + AccessCycles accessCycles; + + int64_t maxChannelCycles = 0; + std::unordered_map maxGroupChannelCycles; + int64_t totalBytes = 0; + std::unordered_map totalGroupBytes; + + for ( auto &[memChannel, groups] : channels ) + { + int64_t channelCycles = 0; + for ( auto &[group, bytes] : groups ) + { + int64_t cycles = bytes / ChannelBW(mem, memChannel); + if ( cycles > maxGroupChannelCycles[group] ) + { + maxGroupChannelCycles[group] = cycles; + } + totalGroupBytes[group] += bytes; + totalBytes += bytes; + channelCycles += cycles; + } + maxChannelCycles = std::max(maxChannelCycles, channelCycles); + } + + accessCycles.fmAccessCycles = + totalGroupBytes.count(TransferGroup::FeatureMaps) ? + std::max(int64_t(totalGroupBytes[TransferGroup::FeatureMaps] / mem->Bandwidth()), maxGroupChannelCycles[TransferGroup::FeatureMaps]) : + 0; + accessCycles.weightsAccessCycles = + totalGroupBytes.count(TransferGroup::Weights) ? + std::max(int64_t(totalGroupBytes[TransferGroup::Weights] / mem->Bandwidth()), maxGroupChannelCycles[TransferGroup::Weights]) : + 0; + accessCycles.scalesAccessCycles = + totalGroupBytes.count(TransferGroup::Scales) ? + std::max(int64_t(totalGroupBytes[TransferGroup::Scales] / mem->Bandwidth()), maxGroupChannelCycles[TransferGroup::Scales]) : + 0; + accessCycles.totalAccessCycles = std::max(int64_t(totalBytes / mem->Bandwidth()), maxChannelCycles); + memoryAccessCycles[mem] = accessCycles; + } + return memoryAccessCycles; +} + } // namespace regor diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85_performance.hpp b/ethosu/regor/architecture/ethosu85/ethos_u85_performance.hpp index 5831023a..a4770348 100644 --- a/ethosu/regor/architecture/ethosu85/ethos_u85_performance.hpp +++ b/ethosu/regor/architecture/ethosu85/ethos_u85_performance.hpp @@ -33,6 +33,18 @@ struct EthosU85PerfInfo float activationCycles[3]; }; +enum class MemChannel +{ + None = 0, + Mem2Mem, + IFMStream, + Weight, + FastWeight, + IFM, + Scale, + Write, +}; + struct EthosU85Cycles { int64_t cycles = 0; @@ -72,15 +84,20 @@ public: ElementAccess ElementTransferToBytes(const PerformanceQuery &query, const ElementAccess &access) override; int64_t WeightDecodeCycles(const PerformanceQuery &query, const WeightStats &weights, Flags format, ArchitectureMemory *weightsMemory) override; - float ChannelBW(const ArchitectureMemory *mem, MemChannel channel) override; void InitDatabase(Database *optDB) override; void RecordToDB(int opId) override; + int64_t MinReadCycles(ArchitectureMemory *mem, int size, TensorUsage usage, OpType type, bool fastWeights) override; + int64_t MinWriteCycles(ArchitectureMemory *mem, int size) override; + std::unordered_map + MeasureAccessCycles(const PerformanceQuery &query, const ElementAccess &byteAccess) override; private: EthosU85Cycles EstimateConvCycles(const PerformanceQuery &query, const std::vector &fused); EthosU85ElementCycles EstimateOutputCyclesPerElement(const PerformanceQuery &query, const std::vector &fused); EthosU85Cycles EstimateElementwiseCycles(const PerformanceQuery &query, const std::vector &fused); int64_t EstimateMinimumMemoryCycles(const PerformanceQuery &query); + float ChannelBW(const ArchitectureMemory *mem, MemChannel channel); + static MemChannel LookupChannel(OpType type, TensorUsage usage, bool fastWeights); }; } // namespace regor diff --git a/ethosu/regor/compiler/network_performance.cpp b/ethosu/regor/compiler/network_performance.cpp index 67b5047c..1f2db8e9 100644 --- 
a/ethosu/regor/compiler/network_performance.cpp +++ b/ethosu/regor/compiler/network_performance.cpp @@ -287,7 +287,7 @@ void NetworkPerformance::AddToDatabase(const PerformanceResult &perf, SchedulerO for ( const auto mem : memories ) { - row.push_back(std::to_string(perf.memory.at(mem).AccessCycles())); + row.push_back(std::to_string(perf.memory.at(mem).accessCycles)); } db->AddRow(opTable, schedOp->Uid(), std::move(row)); @@ -436,7 +436,7 @@ PerformanceResult NetworkPerformance::EstimateFullOpPerformance( ElementAccess access = _arch->Performance()->MeasureElementAccess(query); ElementAccess byteAccess = _arch->Performance()->ElementTransferToBytes(query, access); - + auto memoryAccessCycles = _arch->Performance()->MeasureAccessCycles(query, byteAccess); // How many NPU cycles are available under the previously executing // operator for performing buffered DMA transfers int64_t slackCycles = (prevCost != nullptr) ? prevCost->slackBufferingCycles : 0; @@ -458,6 +458,9 @@ PerformanceResult NetworkPerformance::EstimateFullOpPerformance( result.memory[srcMemory].access[AccessType::Lut].bytesRead += copySize; result.memory[dstMemory].access[AccessType::Lut].bytesWritten += copySize; + // TODO: Add lut transfers through MeasureAccessCycles() instead + result.memory[srcMemory].access[AccessType::Lut].accessCycles += copySize / srcMemory->Bandwidth(); + result.memory[dstMemory].access[AccessType::Lut].accessCycles += copySize / dstMemory->Bandwidth(); } } @@ -543,28 +546,26 @@ PerformanceResult NetworkPerformance::EstimateFullOpPerformance( result.memory[weightsMemory].access[AccessType::Scales].bytesRead += byteAccess.constRead[1]; } + for ( auto &[mem, accessCycles] : memoryAccessCycles ) + { + assert(result.memory.count(mem) > 0); + result.memory[mem].accessCycles = accessCycles.totalAccessCycles; + result.memory[mem].access[AccessType::FeatureMap].accessCycles = accessCycles.fmAccessCycles; + result.memory[mem].access[AccessType::Weights].accessCycles = accessCycles.weightsAccessCycles; + result.memory[mem].access[AccessType::Scales].accessCycles = accessCycles.scalesAccessCycles; + } + // Update memory-access cycles and find the maximum memory read cycle time int64_t maxMemCycles = 0; for ( auto &[mem, stats] : result.memory ) { int64_t totalReadBytes = 0; int64_t totalWriteBytes = 0; - float bandwidth = mem->Bandwidth(); - int64_t memBytes = 0; for ( auto &[accType, acc] : stats.access ) { - // compute cycles per accessType - int64_t bytes = acc.bytesRead + acc.bytesWritten; - memBytes += bytes; - int64_t accCycles = int64_t(float(bytes) / bandwidth); - acc.accessCycles = accCycles; totalReadBytes += acc.bytesRead; totalWriteBytes += acc.bytesWritten; } - // get maximum cycles per memory - int64_t memCycles = int64_t(float(memBytes) / bandwidth); - maxMemCycles = std::max(maxMemCycles, memCycles); - if ( totalReadBytes > 0 ) { stats.readTransferEff = float(totalReadBytes - stats.readTransferOverhead) / totalReadBytes; @@ -573,6 +574,7 @@ PerformanceResult NetworkPerformance::EstimateFullOpPerformance( { stats.writeTransferEff = float(totalWriteBytes - stats.writeTransferOverhead) / totalWriteBytes; } + maxMemCycles = std::max(maxMemCycles, stats.accessCycles); } result.totalCycles = std::max(result.npuCycles, maxMemCycles); diff --git a/ethosu/regor/compiler/network_performance.hpp b/ethosu/regor/compiler/network_performance.hpp index 588eb06d..b379e0e1 100644 --- a/ethosu/regor/compiler/network_performance.hpp +++ b/ethosu/regor/compiler/network_performance.hpp @@ -1,5 +1,5 @@ // -// 
SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2021-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -67,15 +67,7 @@ struct PerformanceResult int64_t writeTransferOverhead = 0; float readTransferEff = 1; float writeTransferEff = 1; - int64_t AccessCycles() const - { - int64_t cycles = 0; - for ( const auto &[type, acc] : access ) - { - cycles += acc.accessCycles; - } - return cycles; - } + int64_t accessCycles = 0; MemoryAccesses &operator+=(const MemoryAccesses &other) { @@ -88,7 +80,7 @@ struct PerformanceResult } }; - std::unordered_map memory; + std::unordered_map memory; int64_t npuCycles = 0; int64_t cpuCycles = 0; int64_t totalCycles = 0; diff --git a/ethosu/regor/compiler/scheduler.cpp b/ethosu/regor/compiler/scheduler.cpp index 6558ea5e..c0509540 100644 --- a/ethosu/regor/compiler/scheduler.cpp +++ b/ethosu/regor/compiler/scheduler.cpp @@ -1089,12 +1089,17 @@ void Scheduler::ProposeWeightBuffering(SchedulerConnection *weights, SchedulerCo forceFullDepthSlice = FulldepthWeightBuffering(_ops, weightTens, schedOp, cost, prevOp, prevCost, refSchedule); } + // Estimate the buffering cycle time for the full set of weights + int64_t fullTransferCycles = _arch->Performance()->MemToMemCycles(_arch->StagingMemory().memory, weightTens->memArea.memory, fullWeightsBytes); + + if ( _spilling && !forceFullDepthSlice ) { // To be refined and architecture specific depending on mem2mem characteristics and prebuffering float bwRatio = std::round( - _arch->Performance()->ChannelBW(weightTens->memArea.memory, MemChannel::Weight) / - _arch->Performance()->ChannelBW(weightTens->memArea.memory, MemChannel::Mem2Mem)); + fullTransferCycles / + _arch->Performance()->MinReadCycles(weightTens->memArea.memory, fullWeightsBytes, TensorUsage::Weights, + schedOp->Type(), weightFormat % WeightFormat::Fast)); needsDMA = (cost->elementAccess.weightsRefetch > 2) || (cost->elementAccess.weightsRefetch == 2 && bwRatio < 2); } @@ -1132,9 +1137,6 @@ void Scheduler::ProposeWeightBuffering(SchedulerConnection *weights, SchedulerCo } else { - // Estimate the buffering cycle time for the full set of weights - int64_t fullTransferCycles = _arch->Performance()->MemToMemCycles( - _arch->StagingMemory().memory, weightTens->memArea.memory, fullWeightsBytes); cost->fullWeightTransferCycles = fullTransferCycles; // Calculate the amount of pre-buffering necessary (or what is possible with limited @@ -1716,6 +1718,15 @@ PerformanceQuery Scheduler::InitPerfQuery( query.encodedWeightSize = unsigned(weightBytes * ratio); query.encodedScaleSize = unsigned(scaleBytes * ratio); query.constMemory = cost->npuWeightsTensor->memArea.memory; + if ( cost->bufferedWeightTensor.tensor ) + { + query.weightStagingMemory = cost->bufferedWeightTensor.tensor->memArea.memory; + if ( cost->bufferedWeightTensor.preBuffer ) + { + auto preBufferRatio = float(cost->ofmDepthSlices[1]) / cost->ofmDepthSlices.back(); + query.firstWeightDMASize = query.encodedWeightSize * preBufferRatio; + } + } } query.weightFormat = wgtFormat; -- GitLab