From a3a1df53e0c861ab2e3d85d14e6d118d58d211e0 Mon Sep 17 00:00:00 2001 From: Philip Hall Date: Wed, 5 Mar 2025 12:47:17 +0000 Subject: [PATCH] MLBEDSW-10106: Update Ethos-U55 MatMul performance stats - Updated performance calculations for the Ethos-U55 MatMul implementation. This is required to maintain the Ethos-U55/Ethos-U85 abstraction (both must return a result) when using the performance interface. - Fixed incomplete implementation of encoded weights byte transfer values. - Replaced manual datatype related scaling to use the DataType scaling functions. Signed-off-by: Philip Hall Change-Id: I7c8deb4e2740518874530786481d4ef57822bac4 --- ethosu/regor/architecture/architecture.hpp | 3 + .../ethosu55/ethos_u55_performance.cpp | 113 ++++++++++++++---- .../ethosu55/ethos_u55_performance.hpp | 5 +- .../ethos_u55_register_cs_generator.cpp | 2 +- .../ethosu85/ethos_u85_performance.cpp | 55 +++++---- .../ethosu85/ethos_u85_performance.hpp | 4 +- ethosu/regor/compiler/network_performance.cpp | 34 +++--- ethosu/regor/compiler/scheduler.cpp | 25 +++- ethosu/regor/compiler/scheduler.hpp | 2 +- ethosu/regor/compiler/scheduler_packing.cpp | 2 +- 10 files changed, 174 insertions(+), 71 deletions(-) diff --git a/ethosu/regor/architecture/architecture.hpp b/ethosu/regor/architecture/architecture.hpp index 993df631..5967770e 100644 --- a/ethosu/regor/architecture/architecture.hpp +++ b/ethosu/regor/architecture/architecture.hpp @@ -243,6 +243,9 @@ struct PerformanceQuery Shape constShape; ArchitectureMemory *constMemory; WeightFormat weightFormat; + ArchitectureMemory *tmpMemory; + unsigned encodedWeightSize; + unsigned encodedScaleSize; }; struct WeightStats diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_performance.cpp b/ethosu/regor/architecture/ethosu55/ethos_u55_performance.cpp index 82aa1ddd..26c83483 100644 --- a/ethosu/regor/architecture/ethosu55/ethos_u55_performance.cpp +++ b/ethosu/regor/architecture/ethosu55/ethos_u55_performance.cpp @@ -61,34 +61,19 @@ 
CycleCost EthosU55Performance::MeasureCycleCost(const PerformanceQuery &query, c auto npuOp = _arch->GetHWOp(query.type); const bool recordToDb = _db && _nextId != -1; - // Convolution/Vector product cycle calculation if ( OpUsesMacs(npuOp) ) { - if ( (npuOp == EthosU55NpuOp::Depthwise) || (npuOp == EthosU55NpuOp::Pooling) ) - { - cycles.macs = int64_t(query.kernel->ElementsWH()) * query.ofmShape.Elements() * 1; - } - else - { - cycles.macs = int64_t(query.kernel->ElementsWH()) * query.ofmShape.Elements() * query.ifmShape[0].Depth(); - } - cycleComponents = EstimateConvCycles(query, fused); + cycles.macs = cycleComponents.macs; cycles.opCycles = cycleComponents.cycles; } // Elementwise cycle calculation else if ( npuOp == EthosU55NpuOp::Elementwise ) { - auto [totCCPerElem, aoCCPerElem, cmdCCPerElem] = EstimateOutputCyclesPerElement(query, fused); - auto ofmShape = - (query.ofmFormat == TensorFormat::NHCWB16) ? Shape::RoundAway(query.ofmShape, Shape(1, 1, 1, 16)) : query.ofmShape; - cycles.opCycles = int64_t(totCCPerElem * float(ofmShape.Elements())); - if ( recordToDb ) - { - cycleComponents.aoCycles = int64_t(aoCCPerElem * float(ofmShape.Elements())); - cycleComponents.cmdCycles = int64_t(cmdCCPerElem * float(ofmShape.Elements())); - } + cycleComponents = EstimateElementwiseCycles(query, fused); + cycles.macs = cycleComponents.macs; + cycles.opCycles = cycleComponents.cycles; } else if ( npuOp == EthosU55NpuOp::Dma ) { @@ -97,9 +82,25 @@ CycleCost EthosU55Performance::MeasureCycleCost(const PerformanceQuery &query, c } else if ( npuOp == EthosU55NpuOp::Compound ) { - // TODO: Measure variable-implementation ops assert(query.type == OpType::Transpose || query.type == OpType::MatMul); - cycles.opCycles = EstimateMinimumMemoryCycles(query); + if ( query.type == OpType::MatMul ) + { + cycleComponents = EstimateMatMulCycles(query, fused); + cycles.macs = cycleComponents.macs; + cycles.opCycles = cycleComponents.cycles; + } + else + { + // TODO: Measure 
variable-implementation ops + // (default estimation based on memory access) + ElementAccess estimate = MeasureElementAccess(query); + estimate = ElementTransferToBytes(query, estimate); + assert(query.ifmMemory[0] && query.ofmMemory); + int64_t fromCycles = + int64_t(float(estimate.ifmRead[0]) / query.ifmMemory[0]->Bandwidth()) + query.ifmMemory[0]->ReadLatency(); + int64_t toCycles = int64_t(float(estimate.ofmWrite) / query.ofmMemory->Bandwidth()) + query.ofmMemory->WriteLatency(); + cycles.opCycles = std::max(fromCycles, toCycles); + } } else { @@ -131,7 +132,6 @@ CycleCost EthosU55Performance::MeasureCycleCost(const PerformanceQuery &query, c _nextId = -1; } - return cycles; } @@ -308,7 +308,63 @@ EthosU55Cycles EthosU55Performance::EstimateConvCycles(const PerformanceQuery &q cmdCycles = cmdCycles * numOfmBlks + cyclesDpuBlk; } - return {totalCycles, cyclesDpu, cyclesAO, cmdCycles}; + int64_t totalMacs = int64_t(query.kernel->ElementsWH()) * query.ofmShape.Elements(); + if ( (npuOp != EthosU55NpuOp::Depthwise) && (npuOp != EthosU55NpuOp::Pooling) ) + { + totalMacs *= query.ifmShape[0].Depth(); + } + + return {totalCycles, cyclesDpu, cyclesAO, cmdCycles, totalMacs}; +} + +EthosU55Cycles EthosU55Performance::EstimateElementwiseCycles(const PerformanceQuery &query, const std::vector &fused) +{ + auto [totCCPerElem, aoCCPerElem, cmdCCPerElem] = EstimateOutputCyclesPerElement(query, fused); + auto ofmShape = + (query.ofmFormat == TensorFormat::NHCWB16) ? 
Shape::RoundAway(query.ofmShape, Shape(1, 1, 1, 16)) : query.ofmShape; + float elements = float(ofmShape.Elements64()); + EthosU55Cycles cycleComponents{}; + cycleComponents.cycles = int64_t(totCCPerElem * elements); + cycleComponents.aoCycles = int64_t(aoCCPerElem * elements); + cycleComponents.cmdCycles = int64_t(cmdCCPerElem * elements); + return cycleComponents; +} + +EthosU55Cycles EthosU55Performance::EstimateMatMulCycles(const PerformanceQuery &query, const std::vector &fused) +{ + // Query the cost of individual parts of the matmul implementation + EthosU55OpConfig *config = static_cast(query.config); + PerformanceQuery subQuery = query; + + // Mul cost + subQuery.type = OpType::Mul; + subQuery.config = config->PrevConfig(); + subQuery.ofmShape = query.ifmShape[0]; + subQuery.ifmShape[1] = query.ifmShape[0]; + subQuery.ofmType = DataType::Int32; + subQuery.ofmMemory = query.tmpMemory; + EthosU55Cycles mulCost = EstimateElementwiseCycles(subQuery, fused); + + // ReduceSum cost + subQuery.type = OpType::ReduceSum; + subQuery.config = config; + subQuery.ifmShape[1] = Shape(); + subQuery.ifmMemory[0] = query.tmpMemory; + subQuery.ofmShape = subQuery.ofmShape.WithDepth(1); + subQuery.ofmType = query.ofmType; + subQuery.ofmMemory = query.ofmMemory; + EthosU55Cycles sumCost = EstimateConvCycles(subQuery, fused); + + // Repeat for every column of the ofm + int cols = query.ifmShape[1].Width(); + EthosU55Cycles cycles{}; + cycles.macs = (mulCost.macs + sumCost.macs) * cols; + cycles.cycles = (mulCost.cycles + sumCost.cycles) * cols; + cycles.aoCycles = (mulCost.aoCycles + sumCost.aoCycles) * cols; + cycles.cmdCycles = (mulCost.cmdCycles + sumCost.cmdCycles) * cols; + cycles.macCycles = (mulCost.macCycles + sumCost.macCycles) * cols; + + return cycles; } static int EstimateMemoryTransfer(int cores, bool isRead, ArchitectureMemory *memory, TensorFormat format, @@ -603,9 +659,14 @@ ElementAccess EthosU55Performance::ElementTransferToBytes(const PerformanceQuery 
result.ofmWrite = EstimateMemoryTransfer(_arch->_cores, false, query.ofmMemory, query.ofmFormat, DataTypeSizeBits(query.ofmType), opConfig->OfmBlock(), query.ofmShape, access.ofmWrite); - // These requires compression ratio information - result.constRead[0] = 0; - result.constRead[1] = 0; + // Use encoded information from query to estimate weight reads if present + result.constRead[0] = result.constRead[1] = 0; + if ( query.encodedWeightSize ) + { + result.constRead[0] = access.weightsRefetch * query.encodedWeightSize; + result.constRead[1] = access.weightsRefetch * query.encodedScaleSize; + result.weightsRefetch = 1; + } return result; } diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_performance.hpp b/ethosu/regor/architecture/ethosu55/ethos_u55_performance.hpp index c3b8d2a5..6d50c1d0 100644 --- a/ethosu/regor/architecture/ethosu55/ethos_u55_performance.hpp +++ b/ethosu/regor/architecture/ethosu55/ethos_u55_performance.hpp @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2021-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -39,6 +39,7 @@ struct EthosU55Cycles int64_t macCycles; int64_t aoCycles; int64_t cmdCycles; + int64_t macs; }; struct EthosU55ElementCycles @@ -78,6 +79,8 @@ public: private: EthosU55Cycles EstimateConvCycles(const PerformanceQuery &query, const std::vector &fused); + EthosU55Cycles EstimateElementwiseCycles(const PerformanceQuery &query, const std::vector &fused); + EthosU55Cycles EstimateMatMulCycles(const PerformanceQuery &query, const std::vector &fused); EthosU55ElementCycles EstimateOutputCyclesPerElement(const PerformanceQuery &query, const std::vector &fused); int64_t EstimateMinimumMemoryCycles(const PerformanceQuery &query); }; diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.cpp b/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.cpp index 
736b129e..d1942579 100644 --- a/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.cpp +++ b/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.cpp @@ -1465,7 +1465,7 @@ void EthosU55RCSGenerator::InsertTransposeCommand(const HLCStripe *stripe, Tempo query.ofmShape = outFM.shape; query.ofmFormat = TensorFormat::NHWC; query.transpose = ofm.transpose; - temps.configs.push_back(_arch->FindBlockConfig(cmd->operation->type, query)); + temps.configs.push_back(_arch->GetOpConfig(cmd->operation->type, query)); } cmd->operation->config = temps.configs.back().get(); // Add to CMD list diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85_performance.cpp b/ethosu/regor/architecture/ethosu85/ethos_u85_performance.cpp index 302f70b5..2519623d 100644 --- a/ethosu/regor/architecture/ethosu85/ethos_u85_performance.cpp +++ b/ethosu/regor/architecture/ethosu85/ethos_u85_performance.cpp @@ -72,32 +72,17 @@ CycleCost EthosU85Performance::MeasureCycleCost(const PerformanceQuery &query, c // Convolution/Vector product cycle calculation if ( OpUsesMacs(npuOp) ) { - if ( npuOp == EthosU85NpuOp::Depthwise || npuOp == EthosU85NpuOp::Pooling || - npuOp == EthosU85NpuOp::ReduceMinMax || npuOp == EthosU85NpuOp::ArgMax ) - { - cycles.macs = int64_t(query.kernel->ElementsWH()) * query.ofmShape.Elements(); - } - else - { - cycles.macs = int64_t(query.kernel->ElementsWH()) * query.ofmShape.Elements() * query.ifmShape[0].Depth(); - } - cycles.macs /= sparse ? 2 : 1; - cycleComponents = EstimateConvCycles(query, fused); + cycles.macs = cycleComponents.macs; + cycles.macs /= sparse ? 2 : 1; cycles.opCycles = cycleComponents.cycles; } // Elementwise cycle calculation else if ( npuOp == EthosU85NpuOp::Elementwise ) { - auto [totCCPerElem, aoCCPerElem, cmdCCPerElem] = EstimateOutputCyclesPerElement(query, fused); - auto ofmShape = - (query.ofmFormat == TensorFormat::NHCWB16) ?
Shape::RoundAway(query.ofmShape, Shape(1, 1, 1, 16)) : query.ofmShape; - cycles.opCycles = int64_t(totCCPerElem * float(ofmShape.Elements())); - if ( recordToDb ) - { - cycleComponents.aoCycles = int64_t(aoCCPerElem * float(ofmShape.Elements())); - cycleComponents.cmdCycles = int64_t(cmdCCPerElem * float(ofmShape.Elements())); - } + cycleComponents = EstimateElementwiseCycles(query, fused); + cycles.macs = cycleComponents.macs; + cycles.opCycles = cycleComponents.cycles; } // Resize cycle calculation else if ( npuOp == EthosU85NpuOp::Resize ) @@ -298,7 +282,25 @@ EthosU85Cycles EthosU85Performance::EstimateConvCycles(const PerformanceQuery &q cmdCycles = cmdCycles * numOfmBlks + cyclesDpuBlk; } - return {totalCycles, cyclesDpu, cyclesAO, cmdCycles}; + int64_t totalMacs = int64_t(query.kernel->ElementsWH()) * query.ofmShape.Elements(); + if ( npuOp != EthosU85NpuOp::Depthwise && npuOp != EthosU85NpuOp::Pooling && npuOp != EthosU85NpuOp::ReduceMinMax && npuOp != EthosU85NpuOp::ArgMax ) + { + totalMacs *= query.ifmShape[0].Depth(); + } + return {totalCycles, cyclesDpu, cyclesAO, cmdCycles, totalMacs}; +} + +EthosU85Cycles EthosU85Performance::EstimateElementwiseCycles(const PerformanceQuery &query, const std::vector &fused) +{ + auto [totCCPerElem, aoCCPerElem, cmdCCPerElem] = EstimateOutputCyclesPerElement(query, fused); + auto ofmShape = + (query.ofmFormat == TensorFormat::NHCWB16) ?
Shape::RoundAway(query.ofmShape, Shape(1, 1, 1, 16)) : query.ofmShape; + float elements = float(ofmShape.Elements64()); + EthosU85Cycles cycleComponents{}; + cycleComponents.cycles = int64_t(totCCPerElem * elements); + cycleComponents.aoCycles = int64_t(aoCCPerElem * elements); + cycleComponents.cmdCycles = int64_t(cmdCCPerElem * elements); + return cycleComponents; } static int64_t EstimateMemoryTransfer(int cores, bool isRead, ArchitectureMemory *memory, TensorFormat format, @@ -585,9 +587,14 @@ ElementAccess EthosU85Performance::ElementTransferToBytes(const PerformanceQuery result.ofmWrite = EstimateMemoryTransfer(_arch->_cores, false, query.ofmMemory, query.ofmFormat, DataTypeSizeBits(query.ofmType), ofmBlock, query.ofmShape, access.ofmWrite); - // These requires compression ratio information - result.constRead[0] = 0; - result.constRead[1] = 0; + // Use encoded information from query to estimate weight reads if present + result.constRead[0] = result.constRead[1] = 0; + if ( query.encodedWeightSize ) + { + result.constRead[0] = access.weightsRefetch * query.encodedWeightSize; + result.constRead[1] = access.weightsRefetch * query.encodedScaleSize; + result.weightsRefetch = 1; + } return result; } diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85_performance.hpp b/ethosu/regor/architecture/ethosu85/ethos_u85_performance.hpp index a048e99e..72cdd42f 100644 --- a/ethosu/regor/architecture/ethosu85/ethos_u85_performance.hpp +++ b/ethosu/regor/architecture/ethosu85/ethos_u85_performance.hpp @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2021-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -39,6 +39,7 @@ struct EthosU85Cycles int64_t macCycles = 0; int64_t aoCycles = 0; int64_t cmdCycles = 0; + int64_t macs = 0; }; struct EthosU85ElementCycles @@ -79,6 +80,7 @@ public: private: EthosU85Cycles EstimateConvCycles(const 
PerformanceQuery &query, const std::vector &fused); EthosU85ElementCycles EstimateOutputCyclesPerElement(const PerformanceQuery &query, const std::vector &fused); + EthosU85Cycles EstimateElementwiseCycles(const PerformanceQuery &query, const std::vector &fused); int64_t EstimateMinimumMemoryCycles(const PerformanceQuery &query); }; diff --git a/ethosu/regor/compiler/network_performance.cpp b/ethosu/regor/compiler/network_performance.cpp index dc88940d..b0995346 100644 --- a/ethosu/regor/compiler/network_performance.cpp +++ b/ethosu/regor/compiler/network_performance.cpp @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2021-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -402,7 +402,7 @@ PerformanceResult NetworkPerformance::EstimateFullOpPerformance( { UNUSED(prevOp); auto wgtFormat = cost->npuWeightsTensor ? cost->npuWeightsTensor->config->Format() : Flags(WeightFormat::Default); - PerformanceQuery query = Scheduler::InitPerfQuery(schedOp, cost->Config(), -1, wgtFormat); + PerformanceQuery query = Scheduler::InitPerfQuery(schedOp, cost->Config(), -1, wgtFormat, cost); std::vector fused = Scheduler::InitFusionQuery(schedOp); // Memory that NPU will source weights from for operations @@ -503,35 +503,41 @@ PerformanceResult NetworkPerformance::EstimateFullOpPerformance( auto ofm = schedOp->OFM(); result.memory[ofm->tensor->memArea.memory].access[AccessType::FeatureMap].bytesWritten += byteAccess.ofmWrite; result.memory[ofm->tensor->memArea.memory] - .writeTransferOverhead += byteAccess.ofmWrite - DataTypeSizeBits(ofm->tensor->dataType) / 8 * access.ofmWrite; + .writeTransferOverhead += byteAccess.ofmWrite - DataTypeStorageSizeBytes(ofm->tensor->dataType, access.ofmWrite); // IFM1 read auto ifm = schedOp->IFM(0); result.memory[ifm->tensor->memArea.memory].access[AccessType::FeatureMap].bytesRead += byteAccess.ifmRead[0]; 
result.memory[ifm->tensor->memArea.memory] - .readTransferOverhead += byteAccess.ifmRead[0] - DataTypeSizeBits(ifm->tensor->dataType) / 8 * access.ifmRead[0]; + .readTransferOverhead += byteAccess.ifmRead[0] - DataTypeStorageSizeBytes(ifm->tensor->dataType, access.ifmRead[0]); // IFM2 read auto ifm2 = schedOp->TryIFM(1); if ( ifm2 ) { result.memory[ifm2->tensor->memArea.memory].access[AccessType::FeatureMap].bytesRead += byteAccess.ifmRead[1]; - result.memory[ifm2->tensor->memArea.memory] - .readTransferOverhead += byteAccess.ifmRead[1] - DataTypeSizeBits(ifm2->tensor->dataType) / 8 * access.ifmRead[1]; + result.memory[ifm2->tensor->memArea.memory].readTransferOverhead += + byteAccess.ifmRead[1] - DataTypeStorageSizeBytes(ifm2->tensor->dataType, access.ifmRead[1]); } - // Weight read - if ( cost->npuWeightsTensor && access.constRead[0] > 0 ) + // Reads/writes to temporary or intermediate memories + auto scratch = schedOp->TryInput(TensorUsage::Scratch); + if ( scratch ) { - int encodedWeightsSize = cost->npuWeightsTensor->totalWeightBytes; - result.memory[weightsMemory].access[AccessType::Weights].bytesRead += int64_t(encodedWeightsSize) * access.weightsRefetch; + result.memory[scratch->tensor->memArea.memory].access[AccessType::FeatureMap].bytesRead += byteAccess.tmpRead; + result.memory[scratch->tensor->memArea.memory] + .readTransferOverhead += byteAccess.tmpRead - DataTypeStorageSizeBytes(scratch->tensor->dataType, access.tmpRead); + + result.memory[scratch->tensor->memArea.memory].access[AccessType::FeatureMap].bytesWritten += byteAccess.tmpWrite; + result.memory[scratch->tensor->memArea.memory].writeTransferOverhead += + byteAccess.tmpWrite - DataTypeStorageSizeBytes(scratch->tensor->dataType, access.tmpWrite); } - // Scale read - if ( cost->npuWeightsTensor && access.constRead[1] > 0 ) + // Weight/scale reads + if ( cost->npuWeightsTensor ) { - int encodedScaleSize = cost->npuWeightsTensor->AllocationSizeBytes() - cost->npuWeightsTensor->totalWeightBytes; -
result.memory[weightsMemory].access[AccessType::Scales].bytesRead += int64_t(encodedScaleSize) * access.weightsRefetch; + result.memory[weightsMemory].access[AccessType::Weights].bytesRead += byteAccess.constRead[0]; + result.memory[weightsMemory].access[AccessType::Scales].bytesRead += byteAccess.constRead[1]; } // Update memory-access cycles and find the maximum memory read cycle time diff --git a/ethosu/regor/compiler/scheduler.cpp b/ethosu/regor/compiler/scheduler.cpp index 06d007a9..e2cd127a 100644 --- a/ethosu/regor/compiler/scheduler.cpp +++ b/ethosu/regor/compiler/scheduler.cpp @@ -1644,7 +1644,8 @@ void Scheduler::CoalesceWeightBufferTensors(Schedule *schedule) } -PerformanceQuery Scheduler::InitPerfQuery(SchedulerOperation *op, ArchitectureOpConfig *config, int ofmDepth, WeightFormat wgtFormat) +PerformanceQuery Scheduler::InitPerfQuery( + SchedulerOperation *op, ArchitectureOpConfig *config, int ofmDepth, WeightFormat wgtFormat, SchedulerOpInfo *cost) { PerformanceQuery query = {}; query.type = op->Type(); @@ -1667,11 +1668,18 @@ PerformanceQuery Scheduler::InitPerfQuery(SchedulerOperation *op, ArchitectureOp } SchedulerConnection *ofm = op->OFM(); - query.ofmShape = (ofmDepth >= 0) ? ofm->SliceShape().WithDepth(ofmDepth) : ofm->SliceShape(); + ofmDepth = (ofmDepth >= 0) ? 
ofmDepth : ofm->SliceShape().Depth(); + query.ofmShape = ofm->SliceShape().WithDepth(ofmDepth); query.ofmMemory = ofm->tensor->memArea.memory; query.ofmType = ofm->tensor->dataType; query.ofmFormat = ofm->tensor->format; + SchedulerConnection *scratch = op->TryInput(TensorUsage::Scratch); + if ( scratch ) + { + query.tmpMemory = scratch->tensor->memArea.memory; + } + SchedulerConnection *scales = op->TryInput(TensorUsage::Scales); if ( scales ) { @@ -1679,6 +1687,19 @@ PerformanceQuery Scheduler::InitPerfQuery(SchedulerOperation *op, ArchitectureOp query.constMemory = scales->tensor->memArea.memory; } + // If post-schedule cost is available, update with encoded sizes + if ( cost && cost->npuWeightsTensor ) + { + float ratio = float(ofmDepth) / ofm->SliceShape().Depth(); + unsigned weightBytes = cost->npuWeightsTensor->totalWeightBytes; + unsigned scaleBytes = cost->npuWeightsTensor->AllocationSizeBytes() - weightBytes; + + // Encoded weight and scale sizes, estimated as a proportion if sliced. 
+ query.encodedWeightSize = unsigned(weightBytes * ratio); + query.encodedScaleSize = unsigned(scaleBytes * ratio); + query.constMemory = cost->npuWeightsTensor->memArea.memory; + } + query.weightFormat = wgtFormat; return query; diff --git a/ethosu/regor/compiler/scheduler.hpp b/ethosu/regor/compiler/scheduler.hpp index d9313f3f..c137540b 100644 --- a/ethosu/regor/compiler/scheduler.hpp +++ b/ethosu/regor/compiler/scheduler.hpp @@ -303,7 +303,7 @@ public: void AllocateReadOnlyAddresses(Schedule *schedule, IncrementalLinearAllocator &readOnlyAllocator); static PerformanceQuery InitPerfQuery(SchedulerOperation *op, ArchitectureOpConfig *config, int ofm_depth = -1, - WeightFormat wgtFormat = WeightFormat::Default); + WeightFormat wgtFormat = WeightFormat::Default, SchedulerOpInfo *cost = nullptr); static std::vector InitFusionQuery(SchedulerOperation *op); private: diff --git a/ethosu/regor/compiler/scheduler_packing.cpp b/ethosu/regor/compiler/scheduler_packing.cpp index 215caec3..0007053b 100644 --- a/ethosu/regor/compiler/scheduler_packing.cpp +++ b/ethosu/regor/compiler/scheduler_packing.cpp @@ -558,7 +558,7 @@ std::unique_ptr SchedulerPacking::MakeSchedulerOperation(Ope auto scratchTensor = std::make_shared(req.scratch.type, req.scratch.size, req.scratch.format); SchedulerConnection *scratchConn = schedOp->AddInput(TensorUsage::Scratch0, scratchTensor); scratchConn->shape = req.scratch.size; - scratchTensor->memArea = _arch->StagingMemory(); + scratchTensor->memArea = _arch->FeatureMapMemory(); } } -- GitLab