From 59a7c0016af0e8ff7e3c2c770aef0a983739b5d2 Mon Sep 17 00:00:00 2001 From: William Isaksson Date: Thu, 13 Mar 2025 11:20:49 +0100 Subject: [PATCH] MLBEDSW-9392: Improve Access Cycle Estimation for Ethos-U85 Improves the algorithm for estimating access cycles drastically. Change-Id: I468fb71e373ed6f779eac6370d725466809481f8 Signed-off-by: William Isaksson --- ethosu/regor/architecture/architecture.hpp | 19 +- .../ethosu55/ethos_u55_performance.cpp | 86 ++++++++- .../ethosu55/ethos_u55_performance.hpp | 6 +- .../regor/architecture/ethosu85/ethos_u85.cpp | 11 +- .../regor/architecture/ethosu85/ethos_u85.hpp | 4 +- .../ethosu85/ethos_u85_performance.cpp | 174 +++++++++++++++++- .../ethosu85/ethos_u85_performance.hpp | 19 +- ethosu/regor/compiler/network_performance.cpp | 28 +-- ethosu/regor/compiler/network_performance.hpp | 14 +- ethosu/regor/compiler/scheduler.cpp | 21 ++- 10 files changed, 325 insertions(+), 57 deletions(-) diff --git a/ethosu/regor/architecture/architecture.hpp b/ethosu/regor/architecture/architecture.hpp index 0b28ac01..b5d354aa 100644 --- a/ethosu/regor/architecture/architecture.hpp +++ b/ethosu/regor/architecture/architecture.hpp @@ -244,6 +244,8 @@ struct PerformanceQuery ArchitectureMemory *tmpMemory; unsigned encodedWeightSize; unsigned encodedScaleSize; + ArchitectureMemory *weightStagingMemory; + unsigned firstWeightDMASize; }; struct WeightStats @@ -288,14 +290,12 @@ struct ElementAccess int tmpRead = 0, tmpWrite = 0; }; -enum class MemChannel +struct AccessCycles { - Mem2Mem = 0, - IFMStream = 1, - Weight = 2, - FastWeight = 3, - IFM = 4, - OFM = 5, + int64_t fmAccessCycles = 0; + int64_t weightsAccessCycles = 0; + int64_t scalesAccessCycles = 0; + int64_t totalAccessCycles = 0; }; /// @@ -311,9 +311,12 @@ public: virtual ElementAccess ElementTransferToBytes(const PerformanceQuery &query, const ElementAccess &access) = 0; virtual int64_t WeightDecodeCycles(const PerformanceQuery &query, const WeightStats &weights, Flags format, ArchitectureMemory *weightsMemory) = 0; - virtual float ChannelBW(const ArchitectureMemory *mem, MemChannel channel) = 0; virtual void InitDatabase(Database *db) = 0; virtual void RecordToDB(int opId) = 0; + virtual int64_t MinReadCycles(ArchitectureMemory *mem, int size, TensorUsage usage, OpType type, bool fastWeights) = 0; + virtual int64_t MinWriteCycles(ArchitectureMemory *mem, int size) = 0; + virtual std::unordered_map + MeasureAccessCycles(const PerformanceQuery &query, const ElementAccess &byteAccess) = 0; }; enum class IniParseResult diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_performance.cpp b/ethosu/regor/architecture/ethosu55/ethos_u55_performance.cpp index 9605508a..bd6afa5f 100644 --- a/ethosu/regor/architecture/ethosu55/ethos_u55_performance.cpp +++ b/ethosu/regor/architecture/ethosu55/ethos_u55_performance.cpp @@ -680,12 +680,6 @@ int64_t EthosU55Performance::WeightDecodeCycles( return dmaCycles; } -float EthosU55Performance::ChannelBW(const ArchitectureMemory *mem, const MemChannel channel) -{ - UNUSED(channel); - return mem->Bandwidth(); -} - void EthosU55Performance::InitDatabase(Database *optDB) { _db = optDB; @@ -720,4 +714,84 @@ void EthosU55Performance::RecordToDB(int opId) } } +int64_t EthosU55Performance::MinReadCycles(ArchitectureMemory *mem, int size, TensorUsage usage, OpType type, bool fastWeights) +{ + auto transferCycles = size / double(mem->Bandwidth()); + // Add on latency since this function returns the cycle count for the transfer itself which is not necessarily the + // same as the cycle 
count that the operation attributes to this transfer. + return transferCycles + mem->ReadLatency(); +} + +int64_t EthosU55Performance::MinWriteCycles(ArchitectureMemory *mem, int size) +{ + auto transferCycles = size / double(mem->Bandwidth()); + return transferCycles + mem->WriteLatency(); +} + +enum class TransferGroup +{ + FeatureMaps, + Weights, + Scales, +}; + +std::unordered_map +EthosU55Performance::MeasureAccessCycles(const PerformanceQuery &query, const ElementAccess &byteAccess) +{ + std::unordered_map memoryAccessCycles; + std::unordered_map> transferBytes; + // IFM + transferBytes[query.ifmMemory[0]][TransferGroup::FeatureMaps] += byteAccess.ifmRead[0]; + // IFM2 + if ( !query.ifmShape[1].IsEmpty() ) + { + transferBytes[query.ifmMemory[1]][TransferGroup::FeatureMaps] += byteAccess.ifmRead[1]; + } + // OFM + transferBytes[query.ofmMemory][TransferGroup::FeatureMaps] += byteAccess.ofmWrite; + + if ( query.constMemory ) + { + // Weights + if ( query.weightStagingMemory ) + { + // Concurrent DMA Weights + auto nonPreBufferedWeightsSize = std::max(int64_t(query.encodedWeightSize) - int64_t(query.firstWeightDMASize), int64_t(0)); + transferBytes[query.constMemory][TransferGroup::Weights] += nonPreBufferedWeightsSize; + transferBytes[query.weightStagingMemory][TransferGroup::Weights] += nonPreBufferedWeightsSize; + transferBytes[query.weightStagingMemory][TransferGroup::Weights] += byteAccess.constRead[0]; + } + else + { + transferBytes[query.constMemory][TransferGroup::Weights] += byteAccess.constRead[0]; + } + // Scales + transferBytes[query.constMemory][TransferGroup::Scales] += byteAccess.constRead[1]; + } + // DMA + if ( query.tmpMemory ) + { + transferBytes[query.tmpMemory][TransferGroup::FeatureMaps] += byteAccess.tmpRead; + transferBytes[query.tmpMemory][TransferGroup::FeatureMaps] += byteAccess.tmpWrite; + } + + for ( auto &[mem, groups] : transferBytes ) + { + AccessCycles accessCycles; + int64_t totalBytes = 0; + for ( auto &[group, bytes] : groups ) + { + totalBytes += bytes; + } + + accessCycles.fmAccessCycles = groups.count(TransferGroup::FeatureMaps) ? groups[TransferGroup::FeatureMaps] / mem->Bandwidth() : 0; + accessCycles.weightsAccessCycles = groups.count(TransferGroup::Weights) ? groups[TransferGroup::Weights] / mem->Bandwidth() : 0; + accessCycles.scalesAccessCycles = groups.count(TransferGroup::Scales) ? 
groups[TransferGroup::Scales] / mem->Bandwidth() : 0; + accessCycles.totalAccessCycles = totalBytes / mem->Bandwidth(); + memoryAccessCycles[mem] = accessCycles; + } + + return memoryAccessCycles; +} + } // namespace regor diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_performance.hpp b/ethosu/regor/architecture/ethosu55/ethos_u55_performance.hpp index 9eba2bbc..1f7b6e67 100644 --- a/ethosu/regor/architecture/ethosu55/ethos_u55_performance.hpp +++ b/ethosu/regor/architecture/ethosu55/ethos_u55_performance.hpp @@ -72,9 +72,13 @@ public: ElementAccess ElementTransferToBytes(const PerformanceQuery &query, const ElementAccess &access) override; int64_t WeightDecodeCycles(const PerformanceQuery &query, const WeightStats &weights, Flags format, ArchitectureMemory *weightsMemory) override; - float ChannelBW(const ArchitectureMemory *mem, MemChannel channel) override; void InitDatabase(Database *optDB) override; void RecordToDB(int opId) override; + int64_t MinReadCycles(ArchitectureMemory *mem, int size, TensorUsage usage, OpType type, bool fastWeights) override; + int64_t MinWriteCycles(ArchitectureMemory *mem, int size) override; + std::unordered_map + MeasureAccessCycles(const PerformanceQuery &query, const ElementAccess &byteAccess) override; + private: EthosU55Cycles EstimateConvCycles(const PerformanceQuery &query, const std::vector &fused); diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85.cpp b/ethosu/regor/architecture/ethosu85/ethos_u85.cpp index 70e88b1f..a11d2b1e 100644 --- a/ethosu/regor/architecture/ethosu85/ethos_u85.cpp +++ b/ethosu/regor/architecture/ethosu85/ethos_u85.cpp @@ -67,18 +67,19 @@ static const EthosU85PerfInfo s_EthosU85PerfInfo[] = { static const ArchEthosU85::AcceleratorConfig s_EthosU85Configs[] = { // Accelerator.Ethos_U85_128 - {128, 1, {Shape(1, 2, 8), Shape(1, 1, 16)}, Shape(1, 2, 8), 2, 8192, 8192, 2048, 768, 1, 0, {64, 64, 128, 128, 104}, &s_EthosU85PerfInfo[0]}, + {128, 1, {Shape(1, 2, 8), Shape(1, 1, 16)}, Shape(1, 2, 8), 2, 8192, 8192, 2048, 768, 1, 0, {64, 64, 128, 128, 104, 16}, &s_EthosU85PerfInfo[0]}, // Accelerator.Ethos_U85_256 {256, 1, {Shape(1, 2, 16), Shape(1, 4, 8), Shape(2, 2, 8)}, Shape(2, 2, 8), 3, 16384, 16384, 2048, 1536, 1, 0, - {104, 104, 128, 128, 128}, &s_EthosU85PerfInfo[1]}, + {104, 104, 128, 128, 128, 16}, &s_EthosU85PerfInfo[1]}, // Accelerator.Ethos_U85_512 - {512, 2, {Shape(2, 2, 16), Shape(1, 4, 16)}, Shape(2, 2, 16), 2, 16384, 32768, 4096, 3072, 1, 0, {128, 128, 256, 256, 128}, &s_EthosU85PerfInfo[2]}, + {512, 2, {Shape(2, 2, 16), Shape(1, 4, 16)}, Shape(2, 2, 16), 2, 16384, 32768, 4096, 3072, 1, 0, + {128, 128, 256, 256, 128, 16}, &s_EthosU85PerfInfo[2]}, // Accelerator.Ethos_U85_1024 {1024, 4, {Shape(2, 2, 32), Shape(1, 4, 32), Shape(2, 4, 16)}, Shape(4, 2, 16), 3, 16384, 65536, 4096, 6144, 1, 1, - {256, 256, 416, 208, 256}, &s_EthosU85PerfInfo[3]}, + {256, 256, 416, 208, 256, 16}, &s_EthosU85PerfInfo[3]}, // Accelerator.Ethos_U85_2048 {2048, 4, {Shape(2, 2, 64), Shape(1, 4, 64), Shape(4, 4, 16)}, Shape(4, 4, 16), 3, 32768, 131072, 8192, 12288, 2, 1, - {256, 256, 512, 256, 256}, &s_EthosU85PerfInfo[4]}, + {256, 256, 512, 256, 256, 16}, &s_EthosU85PerfInfo[4]}, }; constexpr int CB_SLOTS = 6; diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85.hpp b/ethosu/regor/architecture/ethosu85/ethos_u85.hpp index e6dbef09..5f734239 100644 --- a/ethosu/regor/architecture/ethosu85/ethos_u85.hpp +++ b/ethosu/regor/architecture/ethosu85/ethos_u85.hpp @@ -172,7 +172,7 @@ public: int cbRamSizeBytes; uint8_t 
numAxiSramLog2; uint8_t numAxiExtLog2; - const std::array channelRBs; + const std::array channelRBs; const EthosU85PerfInfo *perfInfo; }; @@ -196,7 +196,7 @@ private: int _accRamSizeBytes = 0; int _numAxiSramLog2 = 0; int _numAxiExtLog2 = 0; - const std::array *_channelRBs{}; + const std::array *_channelRBs{}; protected: std::unique_ptr _weightEncoder; diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85_performance.cpp b/ethosu/regor/architecture/ethosu85/ethos_u85_performance.cpp index c1007272..0c7c0909 100644 --- a/ethosu/regor/architecture/ethosu85/ethos_u85_performance.cpp +++ b/ethosu/regor/architecture/ethosu85/ethos_u85_performance.cpp @@ -129,8 +129,8 @@ int64_t EthosU85Performance::MemToMemCycles(const ArchitectureMemory *dest, cons int64_t fromCycles = int64_t(float(sizeBytes) / ChannelBW(source, MemChannel::Mem2Mem)); fromCycles += source->ReadLatency(); // TODO: Below shouldn't use the OFM channel. See MLBEDSW-9384. - int64_t toCycles = int64_t(float(sizeBytes) / ChannelBW(dest, MemChannel::OFM)); - toCycles += source->WriteLatency(); + int64_t toCycles = int64_t(float(sizeBytes) / ChannelBW(dest, MemChannel::Write)); + toCycles += dest->WriteLatency(); return std::max(fromCycles, toCycles); } @@ -629,8 +629,13 @@ float EthosU85Performance::ChannelBW(const ArchitectureMemory *mem, const MemCha float read_rb_lim; int maxOutstanding; int latency; - - if ( channel == MemChannel::OFM ) + if ( channel == MemChannel::None ) + { + latency = mem->ReadLatency(); + maxOutstanding = mem->MaxReads(); + read_rb_lim = std::numeric_limits::max(); + } + else if ( channel == MemChannel::Write ) { maxOutstanding = mem->MaxWrites(); latency = mem->WriteLatency(); @@ -640,7 +645,8 @@ float EthosU85Performance::ChannelBW(const ArchitectureMemory *mem, const MemCha { maxOutstanding = mem->MaxReads(); latency = mem->ReadLatency(); - int channelRB = _arch->_channelRBs->at(static_cast(channel)); + auto channelIdx = std::max(static_cast(channel) - 1, 0); + int channelRB = _arch->_channelRBs->at(channelIdx); read_rb_lim = static_cast(channelRB) / burstLenWords; } @@ -684,4 +690,162 @@ void EthosU85Performance::RecordToDB(int opId) } } +MemChannel EthosU85Performance::LookupChannel(OpType type, TensorUsage usage, bool fastWeights) +{ + if ( usage == TensorUsage::Weights ) + { + if ( fastWeights ) + { + return MemChannel::FastWeight; + } + else + { + return MemChannel::Weight; + } + } + else if ( usage == TensorUsage::Scales ) + { + return MemChannel::Scale; + } + else if ( IsIFM(usage) ) + { + if ( (usage == TensorUsage::IFM1 && type == OpType::MatMul) || type == OpType::Resize || IsElementwise(type) ) + { + return MemChannel::IFMStream; + } + else + { + return MemChannel::IFM; + } + } + else if ( IsOFM(usage) ) + { + return MemChannel::Write; + } + else if ( usage == TensorUsage::Scratch ) + { + return MemChannel::IFMStream; + } + else + { + return MemChannel::None; + } +} + +int64_t EthosU85Performance::MinReadCycles(ArchitectureMemory *mem, int size, TensorUsage usage, OpType type, bool fastWeights) +{ + auto channel = LookupChannel(type, usage, fastWeights); + auto transferCycles = size / double(ChannelBW(mem, channel)); + // Add on latency since this function returns the cycle count for the transfer itself which is not necessarily the + // same as the cycle count that the operation attributes to this transfer. 
+ return transferCycles + mem->ReadLatency(); +} + +int64_t EthosU85Performance::MinWriteCycles(ArchitectureMemory *mem, int size) +{ + auto channel = MemChannel::Write; + auto transferCycles = size / double(ChannelBW(mem, channel)); + // Add on latency since this function returns the cycle count for the transfer itself which is not necessarily the + // same as the cycle count that the operation attributes to this transfer. + return transferCycles + mem->WriteLatency(); +} + +std::unordered_map +EthosU85Performance::MeasureAccessCycles(const PerformanceQuery &query, const ElementAccess &byteAccess) +{ + enum class TransferGroup + { + FeatureMaps, + Weights, + Scales, + }; + std::unordered_map memoryAccessCycles; + std::unordered_map>> channelTransferBytes; + // IFM + auto channel = LookupChannel(query.type, TensorUsage::IFM, false); + channelTransferBytes[query.ifmMemory[0]][channel][TransferGroup::FeatureMaps] += byteAccess.ifmRead[0]; + // IFM2 + if ( !query.ifmShape[1].IsEmpty() ) + { + channel = LookupChannel(query.type, TensorUsage::IFM1, false); + channelTransferBytes[query.ifmMemory[1]][channel][TransferGroup::FeatureMaps] += byteAccess.ifmRead[1]; + } + // OFM + channelTransferBytes[query.ofmMemory][MemChannel::Write][TransferGroup::FeatureMaps] += byteAccess.ofmWrite; + + if ( query.constMemory ) + { + // Weights + channel = LookupChannel(query.type, TensorUsage::Weights, query.weightFormat & WeightFormat::Fast); + if ( query.weightStagingMemory ) + { + // Concurrent DMA Weights + auto nonPreBufferedWeightsSize = std::max(int64_t(query.encodedWeightSize) - int64_t(query.firstWeightDMASize), int64_t(0)); + channelTransferBytes[query.constMemory][MemChannel::Mem2Mem][TransferGroup::Weights] += nonPreBufferedWeightsSize; + channelTransferBytes[query.weightStagingMemory][MemChannel::Write][TransferGroup::Weights] += nonPreBufferedWeightsSize; + channelTransferBytes[query.weightStagingMemory][channel][TransferGroup::Weights] += byteAccess.constRead[0]; + } + else + { + channelTransferBytes[query.constMemory][MemChannel::Weight][TransferGroup::Weights] += byteAccess.constRead[0]; + } + // Scales + channel = LookupChannel(query.type, TensorUsage::Scales, false); + channelTransferBytes[query.constMemory][channel][TransferGroup::Scales] += byteAccess.constRead[1]; + } + // DMA + if ( query.tmpMemory ) + { + channel = LookupChannel(query.type, TensorUsage::Scratch, false); + channelTransferBytes[query.tmpMemory][channel][TransferGroup::FeatureMaps] += byteAccess.tmpRead; + channelTransferBytes[query.tmpMemory][MemChannel::Write][TransferGroup::FeatureMaps] += byteAccess.tmpWrite; + } + + // Total access cycles for any grouping: + // Group access cycles = max(group read + group write/mem bw, max group channel cycles) + // Where group channel cycles is the channel transfer cycles attributable to that group. 
+ for ( auto &[mem, channels] : channelTransferBytes ) + { + AccessCycles accessCycles; + + int64_t maxChannelCycles = 0; + std::unordered_map maxGroupChannelCycles; + int64_t totalBytes = 0; + std::unordered_map totalGroupBytes; + + for ( auto &[memChannel, groups] : channels ) + { + int64_t channelCycles = 0; + for ( auto &[group, bytes] : groups ) + { + int64_t cycles = bytes / ChannelBW(mem, memChannel); + if ( cycles > maxGroupChannelCycles[group] ) + { + maxGroupChannelCycles[group] = cycles; + } + totalGroupBytes[group] += bytes; + totalBytes += bytes; + channelCycles += cycles; + } + maxChannelCycles = std::max(maxChannelCycles, channelCycles); + } + + accessCycles.fmAccessCycles = + totalGroupBytes.count(TransferGroup::FeatureMaps) ? + std::max(int64_t(totalGroupBytes[TransferGroup::FeatureMaps] / mem->Bandwidth()), maxGroupChannelCycles[TransferGroup::FeatureMaps]) : + 0; + accessCycles.weightsAccessCycles = + totalGroupBytes.count(TransferGroup::Weights) ? + std::max(int64_t(totalGroupBytes[TransferGroup::Weights] / mem->Bandwidth()), maxGroupChannelCycles[TransferGroup::Weights]) : + 0; + accessCycles.scalesAccessCycles = + totalGroupBytes.count(TransferGroup::Scales) ? + std::max(int64_t(totalGroupBytes[TransferGroup::Scales] / mem->Bandwidth()), maxGroupChannelCycles[TransferGroup::Scales]) : + 0; + accessCycles.totalAccessCycles = std::max(int64_t(totalBytes / mem->Bandwidth()), maxChannelCycles); + memoryAccessCycles[mem] = accessCycles; + } + return memoryAccessCycles; +} + } // namespace regor diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85_performance.hpp b/ethosu/regor/architecture/ethosu85/ethos_u85_performance.hpp index 5831023a..a4770348 100644 --- a/ethosu/regor/architecture/ethosu85/ethos_u85_performance.hpp +++ b/ethosu/regor/architecture/ethosu85/ethos_u85_performance.hpp @@ -33,6 +33,18 @@ struct EthosU85PerfInfo float activationCycles[3]; }; +enum class MemChannel +{ + None = 0, + Mem2Mem, + IFMStream, + Weight, + FastWeight, + IFM, + Scale, + Write, +}; + struct EthosU85Cycles { int64_t cycles = 0; @@ -72,15 +84,20 @@ public: ElementAccess ElementTransferToBytes(const PerformanceQuery &query, const ElementAccess &access) override; int64_t WeightDecodeCycles(const PerformanceQuery &query, const WeightStats &weights, Flags format, ArchitectureMemory *weightsMemory) override; - float ChannelBW(const ArchitectureMemory *mem, MemChannel channel) override; void InitDatabase(Database *optDB) override; void RecordToDB(int opId) override; + int64_t MinReadCycles(ArchitectureMemory *mem, int size, TensorUsage usage, OpType type, bool fastWeights) override; + int64_t MinWriteCycles(ArchitectureMemory *mem, int size) override; + std::unordered_map + MeasureAccessCycles(const PerformanceQuery &query, const ElementAccess &byteAccess) override; private: EthosU85Cycles EstimateConvCycles(const PerformanceQuery &query, const std::vector &fused); EthosU85ElementCycles EstimateOutputCyclesPerElement(const PerformanceQuery &query, const std::vector &fused); EthosU85Cycles EstimateElementwiseCycles(const PerformanceQuery &query, const std::vector &fused); int64_t EstimateMinimumMemoryCycles(const PerformanceQuery &query); + float ChannelBW(const ArchitectureMemory *mem, MemChannel channel); + static MemChannel LookupChannel(OpType type, TensorUsage usage, bool fastWeights); }; } // namespace regor diff --git a/ethosu/regor/compiler/network_performance.cpp b/ethosu/regor/compiler/network_performance.cpp index 67b5047c..1f2db8e9 100644 --- 
a/ethosu/regor/compiler/network_performance.cpp +++ b/ethosu/regor/compiler/network_performance.cpp @@ -287,7 +287,7 @@ void NetworkPerformance::AddToDatabase(const PerformanceResult &perf, SchedulerO for ( const auto mem : memories ) { - row.push_back(std::to_string(perf.memory.at(mem).AccessCycles())); + row.push_back(std::to_string(perf.memory.at(mem).accessCycles)); } db->AddRow(opTable, schedOp->Uid(), std::move(row)); @@ -436,7 +436,7 @@ PerformanceResult NetworkPerformance::EstimateFullOpPerformance( ElementAccess access = _arch->Performance()->MeasureElementAccess(query); ElementAccess byteAccess = _arch->Performance()->ElementTransferToBytes(query, access); - + auto memoryAccessCycles = _arch->Performance()->MeasureAccessCycles(query, byteAccess); // How many NPU cycles are available under the previously executing // operator for performing buffered DMA transfers int64_t slackCycles = (prevCost != nullptr) ? prevCost->slackBufferingCycles : 0; @@ -458,6 +458,9 @@ PerformanceResult NetworkPerformance::EstimateFullOpPerformance( result.memory[srcMemory].access[AccessType::Lut].bytesRead += copySize; result.memory[dstMemory].access[AccessType::Lut].bytesWritten += copySize; + // TODO: Add lut transfers through MeasureAccessCycles() instead + result.memory[srcMemory].access[AccessType::Lut].accessCycles += copySize / srcMemory->Bandwidth(); + result.memory[dstMemory].access[AccessType::Lut].accessCycles += copySize / dstMemory->Bandwidth(); } } @@ -543,28 +546,26 @@ PerformanceResult NetworkPerformance::EstimateFullOpPerformance( result.memory[weightsMemory].access[AccessType::Scales].bytesRead += byteAccess.constRead[1]; } + for ( auto &[mem, accessCycles] : memoryAccessCycles ) + { + assert(result.memory.count(mem) > 0); + result.memory[mem].accessCycles = accessCycles.totalAccessCycles; + result.memory[mem].access[AccessType::FeatureMap].accessCycles = accessCycles.fmAccessCycles; + result.memory[mem].access[AccessType::Weights].accessCycles = accessCycles.weightsAccessCycles; + result.memory[mem].access[AccessType::Scales].accessCycles = accessCycles.scalesAccessCycles; + } + // Update memory-access cycles and find the maximum memory read cycle time int64_t maxMemCycles = 0; for ( auto &[mem, stats] : result.memory ) { int64_t totalReadBytes = 0; int64_t totalWriteBytes = 0; - float bandwidth = mem->Bandwidth(); - int64_t memBytes = 0; for ( auto &[accType, acc] : stats.access ) { - // compute cycles per accessType - int64_t bytes = acc.bytesRead + acc.bytesWritten; - memBytes += bytes; - int64_t accCycles = int64_t(float(bytes) / bandwidth); - acc.accessCycles = accCycles; totalReadBytes += acc.bytesRead; totalWriteBytes += acc.bytesWritten; } - // get maximum cycles per memory - int64_t memCycles = int64_t(float(memBytes) / bandwidth); - maxMemCycles = std::max(maxMemCycles, memCycles); - if ( totalReadBytes > 0 ) { stats.readTransferEff = float(totalReadBytes - stats.readTransferOverhead) / totalReadBytes; @@ -573,6 +574,7 @@ PerformanceResult NetworkPerformance::EstimateFullOpPerformance( { stats.writeTransferEff = float(totalWriteBytes - stats.writeTransferOverhead) / totalWriteBytes; } + maxMemCycles = std::max(maxMemCycles, stats.accessCycles); } result.totalCycles = std::max(result.npuCycles, maxMemCycles); diff --git a/ethosu/regor/compiler/network_performance.hpp b/ethosu/regor/compiler/network_performance.hpp index 588eb06d..b379e0e1 100644 --- a/ethosu/regor/compiler/network_performance.hpp +++ b/ethosu/regor/compiler/network_performance.hpp @@ -1,5 +1,5 @@ // -// 
SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2021-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -67,15 +67,7 @@ struct PerformanceResult int64_t writeTransferOverhead = 0; float readTransferEff = 1; float writeTransferEff = 1; - int64_t AccessCycles() const - { - int64_t cycles = 0; - for ( const auto &[type, acc] : access ) - { - cycles += acc.accessCycles; - } - return cycles; - } + int64_t accessCycles = 0; MemoryAccesses &operator+=(const MemoryAccesses &other) { @@ -88,7 +80,7 @@ struct PerformanceResult } }; - std::unordered_map memory; + std::unordered_map memory; int64_t npuCycles = 0; int64_t cpuCycles = 0; int64_t totalCycles = 0; diff --git a/ethosu/regor/compiler/scheduler.cpp b/ethosu/regor/compiler/scheduler.cpp index 6558ea5e..c0509540 100644 --- a/ethosu/regor/compiler/scheduler.cpp +++ b/ethosu/regor/compiler/scheduler.cpp @@ -1089,12 +1089,17 @@ void Scheduler::ProposeWeightBuffering(SchedulerConnection *weights, SchedulerCo forceFullDepthSlice = FulldepthWeightBuffering(_ops, weightTens, schedOp, cost, prevOp, prevCost, refSchedule); } + // Estimate the buffering cycle time for the full set of weights + int64_t fullTransferCycles = _arch->Performance()->MemToMemCycles(_arch->StagingMemory().memory, weightTens->memArea.memory, fullWeightsBytes); + + if ( _spilling && !forceFullDepthSlice ) { // To be refined and architecture specific depending on mem2mem characteristics and prebuffering float bwRatio = std::round( - _arch->Performance()->ChannelBW(weightTens->memArea.memory, MemChannel::Weight) / - _arch->Performance()->ChannelBW(weightTens->memArea.memory, MemChannel::Mem2Mem)); + fullTransferCycles / + _arch->Performance()->MinReadCycles(weightTens->memArea.memory, fullWeightsBytes, TensorUsage::Weights, + schedOp->Type(), weightFormat % WeightFormat::Fast)); needsDMA = (cost->elementAccess.weightsRefetch > 2) || (cost->elementAccess.weightsRefetch == 2 && bwRatio < 2); } @@ -1132,9 +1137,6 @@ void Scheduler::ProposeWeightBuffering(SchedulerConnection *weights, SchedulerCo } else { - // Estimate the buffering cycle time for the full set of weights - int64_t fullTransferCycles = _arch->Performance()->MemToMemCycles( - _arch->StagingMemory().memory, weightTens->memArea.memory, fullWeightsBytes); cost->fullWeightTransferCycles = fullTransferCycles; // Calculate the amount of pre-buffering necessary (or what is possible with limited @@ -1716,6 +1718,15 @@ PerformanceQuery Scheduler::InitPerfQuery( query.encodedWeightSize = unsigned(weightBytes * ratio); query.encodedScaleSize = unsigned(scaleBytes * ratio); query.constMemory = cost->npuWeightsTensor->memArea.memory; + if ( cost->bufferedWeightTensor.tensor ) + { + query.weightStagingMemory = cost->bufferedWeightTensor.tensor->memArea.memory; + if ( cost->bufferedWeightTensor.preBuffer ) + { + auto preBufferRatio = float(cost->ofmDepthSlices[1]) / cost->ofmDepthSlices.back(); + query.firstWeightDMASize = query.encodedWeightSize * preBufferRatio; + } + } } query.weightFormat = wgtFormat; -- GitLab