From 163f4e86f5b8cc9d7b1b6ee91907a13ff96fd2de Mon Sep 17 00:00:00 2001
From: William Isaksson <william.isaksson@arm.com>
Date: Thu, 19 Dec 2024 16:19:42 +0100
Subject: [PATCH] MLBEDSW-10091: Add more performance debug info to database

Adds more performance debug info to the debug database.

Change-Id: Ibaa6b40b8d0d8566af66746257e17257787b599c
Signed-off-by: William Isaksson <william.isaksson@arm.com>
---
 ethosu/regor/CMakeLists.txt                   |   1 +
 ethosu/regor/architecture/architecture.cpp    |   7 +
 ethosu/regor/architecture/architecture.hpp    |   3 +
 .../ethosu55/ethos_u55_performance.cpp        | 121 +++++++--
 .../ethosu55/ethos_u55_performance.hpp        |  25 +-
 .../ethosu85/ethos_u85_performance.cpp        | 113 ++++++++-
 .../ethosu85/ethos_u85_performance.hpp        |  25 +-
 ethosu/regor/compiler/network_performance.cpp | 234 +++++++++++++++++-
 ethosu/regor/compiler/network_performance.hpp |   7 +-
 ethosu/regor/compiler/scheduler_operation.cpp |  29 +++
 ethosu/regor/compiler/shape_util.hpp          |  13 +
 11 files changed, 531 insertions(+), 47 deletions(-)
 create mode 100644 ethosu/regor/compiler/scheduler_operation.cpp
diff --git a/ethosu/regor/CMakeLists.txt b/ethosu/regor/CMakeLists.txt
index 0810b5be..357ec732 100644
--- a/ethosu/regor/CMakeLists.txt
+++ b/ethosu/regor/CMakeLists.txt
@@ -284,6 +284,7 @@ regor_lib(
         "compiler/scheduler.cpp"
         "compiler/scheduler_decompose.cpp"
         "compiler/scheduler_packing.cpp"
+        "compiler/scheduler_operation.cpp"
         "compiler/softmax.cpp"
         "compiler/tensor.cpp"
         "compiler/tensor_allocator.cpp"
diff --git a/ethosu/regor/architecture/architecture.cpp b/ethosu/regor/architecture/architecture.cpp
index ae85546a..8ab2ca3c 100644
--- a/ethosu/regor/architecture/architecture.cpp
+++ b/ethosu/regor/architecture/architecture.cpp
@@ -30,6 +30,13 @@ BEGIN_ENUM_TABLE(regor::MemUsage)
     ADD_ENUM_NAME(Staging)
 END_ENUM_TABLE()
 
+BEGIN_ENUM_TABLE(regor::TensorFormat)  // Name table for TensorFormat; presumably backs EnumToString - confirm
+    ADD_ENUM_NAME(Unknown)
+    ADD_ENUM_NAME(NHWC)
+    ADD_ENUM_NAME(NHCWB16)
+    ADD_ENUM_NAME(WeightsEncoded)
+END_ENUM_TABLE()
+
 namespace regor
 {
 
diff --git a/ethosu/regor/architecture/architecture.hpp b/ethosu/regor/architecture/architecture.hpp
index 0805c029..1b6a8a1b 100644
--- a/ethosu/regor/architecture/architecture.hpp
+++ b/ethosu/regor/architecture/architecture.hpp
@@ -28,6 +28,7 @@
 #include "common/scaling.hpp"
 #include "common/shape.hpp"
 #include "common/transpose_type.hpp"
+#include "compiler/database.hpp"
 #include "compiler/kernel.hpp"
 #include "compiler/op_type.hpp"
 #include "compiler/tensor_properties.hpp"
@@ -305,6 +306,8 @@ public:
     virtual int64_t WeightDecodeCycles(const PerformanceQuery &query, const WeightStats &weights,
         Flags<WeightFormat> format, ArchitectureMemory *weightsMemory) = 0;
     virtual float ChannelBW(const ArchitectureMemory *mem, MemChannel channel) = 0;
+    virtual void InitDatabase(Database *db) = 0;
+    virtual void RecordToDB(int opId) = 0;
 };
 
 enum class IniParseResult
diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_performance.cpp b/ethosu/regor/architecture/ethosu55/ethos_u55_performance.cpp
index eaebcba3..be5a3b90 100644
--- a/ethosu/regor/architecture/ethosu55/ethos_u55_performance.cpp
+++ b/ethosu/regor/architecture/ethosu55/ethos_u55_performance.cpp
@@ -21,6 +21,7 @@
 #include "common/common.hpp"
 
 #include "architecture/architecture.hpp"
+#include "compiler/shape_util.hpp"
 #include "ethos_u55.hpp"
 
 namespace regor
@@ -56,7 +57,10 @@ CycleCost EthosU55Performance::MeasureCycleCostForSparsity(const PerformanceQuer
 CycleCost EthosU55Performance::MeasureCycleCost(const PerformanceQuery &query, const std::vector<FusionQuery> &fused)
 {
     CycleCost cycles;
+    EthosU55Cycles cycleComponents = {};
     auto npuOp = _arch->GetHWOp(query.type);
+    const bool recordToDb = _db && _nextId != -1;
+
 
     // Convolution/Vector product cycle calculation
     if ( OpUsesMacs(npuOp) )
@@ -70,14 +74,21 @@ CycleCost EthosU55Performance::MeasureCycleCost(const PerformanceQuery &query, c
             cycles.macs = int64_t(query.kernel->ElementsWH()) * query.ofmShape.Elements() * query.ifmShape[0].Depth();
         }
 
-        cycles.opCycles = EstimateConvCycles(query, fused);
+        cycleComponents = EstimateConvCycles(query, fused);
+        cycles.opCycles = cycleComponents.cycles;
     }
     // Elementwise cycle calculation
     else if ( npuOp == EthosU55NpuOp::Elementwise )
     {
+        auto [totCCPerElem, aoCCPerElem, cmdCCPerElem] = EstimateOutputCyclesPerElement(query, fused);
         auto ofmShape =
             (query.ofmFormat == TensorFormat::NHCWB16) ? Shape::RoundAway(query.ofmShape, Shape(1, 1, 1, 16)) : query.ofmShape;
-        cycles.opCycles = int64_t(EstimateOutputCyclesPerElement(query, fused) * float(ofmShape.Elements()));
+        cycles.opCycles = int64_t(totCCPerElem * float(ofmShape.Elements()));
+        if ( recordToDb )
+        {
+            cycleComponents.aoCycles = int64_t(aoCCPerElem * float(ofmShape.Elements()));
+            cycleComponents.cmdCycles = int64_t(cmdCCPerElem * float(ofmShape.Elements()));
+        }
     }
     else if ( npuOp == EthosU55NpuOp::Dma )
     {
@@ -95,6 +106,32 @@ CycleCost EthosU55Performance::MeasureCycleCost(const PerformanceQuery &query, c
         assert(false && "Unknown operator cycle costing");
     }
 
+    if ( recordToDb )
+    {
+        assert(_mainTable != -1);
+        EthosU55OpConfig *opConfig = static_cast<EthosU55OpConfig *>(query.config);
+
+        std::vector<std::string> row = {
+            OpUsesMacs(npuOp) ? std::to_string(cycleComponents.macCycles) : "",
+            std::to_string(cycleComponents.aoCycles),
+            std::to_string(cycleComponents.cmdCycles),
+            opConfig ? EnumToString(opConfig->Traversal()) : "",
+        };
+
+        auto shapeToStrings = [&row](const std::vector<int> &shape)
+        {
+            std::transform(shape.begin(), shape.end(), std::back_inserter(row),
+                [](int n) -> std::string { return n ? std::to_string(n) : ""; });
+        };
+
+        shapeToStrings(ReshapeToNHWC(opConfig ? opConfig->IfmBlock() : Shape()).ToList<int>());
+        shapeToStrings(ReshapeToNHWC(opConfig ? opConfig->OfmBlock() : Shape()).ToList<int>());
+
+        _db->AddRow(_mainTable, _nextId, std::move(row));
+        _nextId = -1;
+    }
+
+
     return cycles;
 }
 
@@ -107,7 +144,7 @@ int64_t EthosU55Performance::MemToMemCycles(const ArchitectureMemory *dest, cons
     return std::max(fromCycles, toCycles);
 }
 
-int64_t EthosU55Performance::EstimateConvCycles(const PerformanceQuery &query, const std::vector<FusionQuery> &fused)
+EthosU55Cycles EthosU55Performance::EstimateConvCycles(const PerformanceQuery &query, const std::vector<FusionQuery> &fused)
 {
     EthosU55OpConfig *opConfig = static_cast<EthosU55OpConfig *>(query.config);
     auto npuOp = _arch->GetHWOp(query.type);
@@ -237,7 +274,11 @@ int64_t EthosU55Performance::EstimateConvCycles(const PerformanceQuery &query, c
 
     // Estimate output cycles
     int numOfmBlks = Shape::DivRoundUp(query.ofmShape, ofmBlock).Elements();
-    int64_t cyclesOutputBlk = int64_t(EstimateOutputCyclesPerElement(query, fused) * float(ofmBlock.Elements()));
+    auto [totCCPerElem, aoCCPerElem, cmdCCPerElem] = EstimateOutputCyclesPerElement(query, fused);
+    auto aoCycles = int64_t(aoCCPerElem * float(ofmBlock.Elements()));
+    auto cmdCycles = int64_t(cmdCCPerElem * float(ofmBlock.Elements()));
+    auto cyclesOutputBlk = int64_t(totCCPerElem * float(ofmBlock.Elements()));
+
 
     // Scale and bias tensor
     if ( query.constShape.Size() > 0 && query.constShape.Depth() > 0 )
@@ -246,23 +287,28 @@ int64_t EthosU55Performance::EstimateConvCycles(const PerformanceQuery &query, c
         cyclesOutputBlk = std::max(cyclesOutputBlk, int64_t(cyclesBiasBlk));
     }
 
-    int64_t cycles_cmd = EstimateMinimumMemoryCycles(query);
-    cycles_cmd = (cycles_cmd + cyclesOutputBlk + cyclesDpuBlk) / 4;  // Per DPU
+    int64_t cmdCycles2 = EstimateMinimumMemoryCycles(query);
+    cmdCycles2 = (cmdCycles2 + cyclesOutputBlk + cyclesDpuBlk) / 4;  // Per DPU
+
+    int64_t cyclesAO = aoCycles * numOfmBlks + cyclesDpuBlk;
+    int64_t cyclesDpu = cyclesDpuBlk * numOfmBlks + cyclesOutputBlk;
 
-    cyclesDpuBlk = std::max(cyclesDpuBlk, cycles_cmd);
-    cyclesOutputBlk = std::max(cyclesOutputBlk, cycles_cmd);
+    cmdCycles = std::max(cmdCycles, cmdCycles2);
+    cyclesDpuBlk = std::max(cyclesDpuBlk, cmdCycles2);
+    cyclesOutputBlk = std::max(cyclesOutputBlk, cmdCycles2);
 
     int64_t totalCycles = 0;
     if ( cyclesDpuBlk > cyclesOutputBlk )
     {
-        totalCycles = cyclesDpuBlk * numOfmBlks + cyclesOutputBlk;
+        totalCycles = int64_t(cyclesDpuBlk * numOfmBlks) + cyclesOutputBlk;
     }
     else
     {
-        totalCycles = cyclesOutputBlk * numOfmBlks + cyclesDpuBlk;
+        totalCycles = int64_t(cyclesOutputBlk * numOfmBlks) + cyclesDpuBlk;
+        cmdCycles = cmdCycles * numOfmBlks + cyclesDpuBlk;
     }
 
-    return totalCycles;
+    return {totalCycles, cyclesDpu, cyclesAO, cmdCycles};
 }
 
 static int EstimateMemoryTransfer(int cores, bool isRead, ArchitectureMemory *memory, TensorFormat format,
@@ -350,7 +396,7 @@ int64_t EthosU55Performance::EstimateMinimumMemoryCycles(const PerformanceQuery
 }
 
 
-float EthosU55Performance::EstimateOutputCyclesPerElement(const PerformanceQuery &query, const std::vector<FusionQuery> &fused)
+EthosU55ElementCycles EthosU55Performance::EstimateOutputCyclesPerElement(const PerformanceQuery &query, const std::vector<FusionQuery> &fused)
 {
     EthosU55OpConfig *opConfig = static_cast<EthosU55OpConfig *>(query.config);
     auto npuOp = _arch->GetHWOp(query.type);
@@ -413,16 +459,18 @@ float EthosU55Performance::EstimateOutputCyclesPerElement(const PerformanceQuery
     }
 
     float cyclesPerElement = std::max(_perfInfo->outputCycles[outputPerfIndex], _perfInfo->activationCycles[activationPerfIndex]);
-
+    float cycleCmd = 0;
+    float aoCyclesPerElement = cyclesPerElement;
     if ( npuOp == EthosU55NpuOp::Elementwise )
     {
         int numElemsBlk = opConfig->OfmBlock().Elements();
         assert(numElemsBlk > 0);
-        float cycleCmd = (float(EstimateMinimumMemoryCycles(query)) / float(numElemsBlk) + cyclesPerElement) / 4.0f;  // per DPU
+        cycleCmd = (float(EstimateMinimumMemoryCycles(query)) / float(numElemsBlk) + cyclesPerElement) / 4.0f;  // per
+                                                                                                                // DPU
         cyclesPerElement = std::max(cyclesPerElement, cycleCmd);
     }
 
-    return cyclesPerElement;
+    return {cyclesPerElement, aoCyclesPerElement, cycleCmd};
 }
 
 ElementAccess EthosU55Performance::MeasureElementAccess(const PerformanceQuery &query)
@@ -558,8 +606,14 @@ ElementAccess EthosU55Performance::ElementTransferToBytes(const PerformanceQuery
 }
 
 int64_t EthosU55Performance::WeightDecodeCycles(
-    const PerformanceQuery &, const WeightStats &weights, Flags<WeightFormat>, ArchitectureMemory *weightsMemory)
+    const PerformanceQuery &, const WeightStats &weights, Flags<WeightFormat> format, ArchitectureMemory *weightsMemory)
 {
+    if ( _db && _nextId != -1 )
+    {
+        assert(_wdTable != -1);
+        _db->AddRow(_wdTable, _nextId, {""});
+        _nextId = -1;
+    }
     int64_t dmaCycles = int64_t(float(weights.encodedSize) / weightsMemory->Bandwidth());
     dmaCycles += weightsMemory->ReadLatency();
     return dmaCycles;
@@ -570,4 +624,39 @@ float EthosU55Performance::ChannelBW(const ArchitectureMemory *mem, const MemCha
     UNUSED(channel);
     return mem->Bandwidth();
 }
+
+void EthosU55Performance::InitDatabase(Database *optDB)  // Registers the U55 perf-debug tables and columns in optDB
+{
+    _db = optDB;  // dereferenced unconditionally below, so optDB must not be null
+    _mainTable = _db->AddTable("perf_debug_main");  // rows are added by MeasureCycleCost
+    _wdTable = _db->AddTable("perf_debug_wd");      // rows are added by WeightDecodeCycles
+
+    std::vector<std::string> columns = {  // order must match the row built in MeasureCycleCost
+        "mac_cycles",
+        "ao_cycles",
+        "cmd_cycles",
+        "traversal",
+    };
+
+    std::vector<std::string> shapes = {"ifm_block", "ofm_block"};  // one _n/_h/_w/_c column group per block shape
+
+    for ( auto &shape : shapes )
+    {
+        columns.push_back(shape + "_n");
+        columns.push_back(shape + "_h");
+        columns.push_back(shape + "_w");
+        columns.push_back(shape + "_c");
+    }
+    _db->AddColumns(_mainTable, std::move(columns));
+    _db->AddColumns(_wdTable, {"wd_cycles"});  // U55 WeightDecodeCycles writes an empty string in this column
+}
+
+void EthosU55Performance::RecordToDB(int opId)  // Arms recording: the next row written is keyed by opId
+{
+    if ( _db )  // ignored until InitDatabase has supplied a database
+    {
+        _nextId = opId;  // reset to -1 by whichever of MeasureCycleCost / WeightDecodeCycles writes a row first
+    }
+}
+
 }  // namespace regor
diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_performance.hpp b/ethosu/regor/architecture/ethosu55/ethos_u55_performance.hpp
index e9839b07..c3b8d2a5 100644
--- a/ethosu/regor/architecture/ethosu55/ethos_u55_performance.hpp
+++ b/ethosu/regor/architecture/ethosu55/ethos_u55_performance.hpp
@@ -33,6 +33,21 @@ struct EthosU55PerfInfo
     float activationCycles[3];
 };
 
+struct EthosU55Cycles  // Cycle breakdown returned by EstimateConvCycles; brace-initialised positionally, keep order
+{
+    int64_t cycles;     // total cycles; MeasureCycleCost copies this into CycleCost::opCycles
+    int64_t macCycles;  // written to the "mac_cycles" column (only for ops that use MACs)
+    int64_t aoCycles;   // written to the "ao_cycles" column
+    int64_t cmdCycles;  // written to the "cmd_cycles" column
+};  // no default member initialisers: value-initialise with `= {}` before reading
+
+struct EthosU55ElementCycles  // Per-element result of EstimateOutputCyclesPerElement; unpacked positionally
+{
+    float cycles;     // per-element cost; for Elementwise ops this is max(aoCycles, cmdCycles)
+    float aoCycles;   // max of the output and activation per-element costs, before the command bound
+    float cmdCycles;  // per-element command/memory bound; 0 unless the op is Elementwise
+};
+
 /// <summary>
 /// Profiles performance analysis for Ethos-U55
 /// </summary>
@@ -41,6 +56,10 @@ class EthosU55Performance : public ArchitecturePerformance
 protected:
     ArchEthosU55 *_arch;
     const EthosU55PerfInfo *_perfInfo;
+    Database *_db = nullptr;
+    int _nextId = -1;
+    int _mainTable = -1;
+    int _wdTable = -1;
 
 public:
     EthosU55Performance(ArchEthosU55 *arch, const EthosU55PerfInfo *perfInfo);
@@ -54,10 +73,12 @@ public:
     int64_t WeightDecodeCycles(const PerformanceQuery &query, const WeightStats &weights, Flags<WeightFormat> format,
         ArchitectureMemory *weightsMemory) override;
     float ChannelBW(const ArchitectureMemory *mem, MemChannel channel) override;
+    void InitDatabase(Database *optDB) override;
+    void RecordToDB(int opId) override;
 
 private:
-    int64_t EstimateConvCycles(const PerformanceQuery &query, const std::vector<FusionQuery> &fused);
-    float EstimateOutputCyclesPerElement(const PerformanceQuery &query, const std::vector<FusionQuery> &fused);
+    EthosU55Cycles EstimateConvCycles(const PerformanceQuery &query, const std::vector<FusionQuery> &fused);
+    EthosU55ElementCycles EstimateOutputCyclesPerElement(const PerformanceQuery &query, const std::vector<FusionQuery> &fused);
     int64_t EstimateMinimumMemoryCycles(const PerformanceQuery &query);
 };
 
diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85_performance.cpp b/ethosu/regor/architecture/ethosu85/ethos_u85_performance.cpp
index db4aa585..a6785cf1 100644
--- a/ethosu/regor/architecture/ethosu85/ethos_u85_performance.cpp
+++ b/ethosu/regor/architecture/ethosu85/ethos_u85_performance.cpp
@@ -22,6 +22,7 @@
 #include "common/logging.hpp"
 
 #include "architecture/architecture.hpp"
+#include "compiler/shape_util.hpp"
 #include "ethos_u85.hpp"
 
 namespace regor
@@ -63,8 +64,11 @@ CycleCost EthosU85Performance::MeasureCycleCostForSparsity(const PerformanceQuer
 CycleCost EthosU85Performance::MeasureCycleCost(const PerformanceQuery &query, const std::vector<FusionQuery> &fused)
 {
     CycleCost cycles;
+    EthosU85Cycles cycleComponents;
+
     auto npuOp = _arch->GetHWOp(query.type);
     const bool sparse = query.weightFormat & WeightFormat::Sparse2_4;
+    const bool recordToDb = _db && _nextId != -1;
     // Convolution/Vector product cycle calculation
     if ( OpUsesMacs(npuOp) )
     {
@@ -79,14 +83,21 @@ CycleCost EthosU85Performance::MeasureCycleCost(const PerformanceQuery &query, c
         }
         cycles.macs /= sparse ? 2 : 1;
 
-        cycles.opCycles = EstimateConvCycles(query, fused);
+        cycleComponents = EstimateConvCycles(query, fused);
+        cycles.opCycles = cycleComponents.cycles;
     }
     // Elementwise cycle calculation
     else if ( npuOp == EthosU85NpuOp::Elementwise )
     {
+        auto [totCCPerElem, aoCCPerElem, cmdCCPerElem] = EstimateOutputCyclesPerElement(query, fused);
         auto ofmShape =
             (query.ofmFormat == TensorFormat::NHCWB16) ? Shape::RoundAway(query.ofmShape, Shape(1, 1, 1, 16)) : query.ofmShape;
-        cycles.opCycles = int64_t(EstimateOutputCyclesPerElement(query, fused) * float(ofmShape.Elements()));
+        cycles.opCycles = int64_t(totCCPerElem * float(ofmShape.Elements()));
+        if ( recordToDb )
+        {
+            cycleComponents.aoCycles = int64_t(aoCCPerElem * float(ofmShape.Elements()));
+            cycleComponents.cmdCycles = int64_t(cmdCCPerElem * float(ofmShape.Elements()));
+        }
     }
     // Resize cycle calculation
     else if ( npuOp == EthosU85NpuOp::Resize )
@@ -108,6 +119,32 @@ CycleCost EthosU85Performance::MeasureCycleCost(const PerformanceQuery &query, c
         assert(false && "Unknown operator cycle costing");
     }
 
+    if ( recordToDb )
+    {
+        assert(_mainTable != -1);
+        EthosU85OpConfig *opConfig = static_cast<EthosU85OpConfig *>(query.config);
+
+        std::vector<std::string> row = {
+            OpUsesMacs(npuOp) ? std::to_string(cycleComponents.macCycles) : "",
+            std::to_string(cycleComponents.aoCycles),
+            std::to_string(cycleComponents.cmdCycles),
+            opConfig ? EnumToString(opConfig->Traversal()) : "",
+        };
+        auto shapeToStrings = [&row](const std::vector<int> &shape)
+        {
+            std::transform(shape.begin(), shape.end(), std::back_inserter(row),
+                [](int n) -> std::string { return n ? std::to_string(n) : ""; });
+        };
+
+
+        shapeToStrings(ReshapeToNHWC(opConfig ? opConfig->IfmBlock() : Shape()).ToList<int>());
+        shapeToStrings(ReshapeToNHWC(opConfig ? opConfig->OfmBlock() : Shape()).ToList<int>());
+        shapeToStrings(ReshapeToNHWC(opConfig ? opConfig->OfmUBlock() : Shape()).ToList<int>());
+
+        _db->AddRow(_mainTable, _nextId, std::move(row));
+        _nextId = -1;
+    }
+
     return cycles;
 }
 
@@ -121,7 +158,7 @@ int64_t EthosU85Performance::MemToMemCycles(const ArchitectureMemory *dest, cons
     return std::max(fromCycles, toCycles);
 }
 
-int64_t EthosU85Performance::EstimateConvCycles(const PerformanceQuery &query, const std::vector<FusionQuery> &fused)
+EthosU85Cycles EthosU85Performance::EstimateConvCycles(const PerformanceQuery &query, const std::vector<FusionQuery> &fused)
 {
     EthosU85OpConfig *opConfig = static_cast<EthosU85OpConfig *>(query.config);
     auto npuOp = _arch->GetHWOp(query.type);
@@ -228,7 +265,10 @@ int64_t EthosU85Performance::EstimateConvCycles(const PerformanceQuery &query, c
     {
         numOfmBlks *= std::max(static_cast<float>(query.ofmShape[i]) / ofmBlock[i], 1.0f);
     }
-    int64_t cyclesOutputBlk = int64_t(EstimateOutputCyclesPerElement(query, fused) * float(ofmBlock.Elements()));
+    auto [totCCPerElem, aoCCPerElem, cmdCCPerElem] = EstimateOutputCyclesPerElement(query, fused);
+    auto aoCycles = int64_t(aoCCPerElem * float(ofmBlock.Elements()));
+    auto cmdCycles = int64_t(cmdCCPerElem * float(ofmBlock.Elements()));
+    auto cyclesOutputBlk = int64_t(totCCPerElem * float(ofmBlock.Elements()));
 
     // Scale and bias tensor
     if ( query.constShape.Size() > 0 && query.constShape.Depth() > 0 )
@@ -237,11 +277,15 @@ int64_t EthosU85Performance::EstimateConvCycles(const PerformanceQuery &query, c
         cyclesOutputBlk = std::max(cyclesOutputBlk, int64_t(cyclesBiasBlk));
     }
 
-    int64_t cycles_cmd = EstimateMinimumMemoryCycles(query);
-    cycles_cmd = (cycles_cmd + cyclesOutputBlk + cyclesDpuBlk) / 4;  // Per DPU
+    int64_t cmdCycles2 = EstimateMinimumMemoryCycles(query);
+    cmdCycles2 = (cmdCycles2 + cyclesOutputBlk + cyclesDpuBlk) / 4;  // Per DPU
+
+    int64_t cyclesAO = aoCycles * numOfmBlks + cyclesDpuBlk;
+    int64_t cyclesDpu = cyclesDpuBlk * numOfmBlks + cyclesOutputBlk;
 
-    cyclesDpuBlk = std::max(cyclesDpuBlk, cycles_cmd);
-    cyclesOutputBlk = std::max(cyclesOutputBlk, cycles_cmd);
+    cmdCycles = std::max(cmdCycles, cmdCycles2);
+    cyclesDpuBlk = std::max(cyclesDpuBlk, cmdCycles2);
+    cyclesOutputBlk = std::max(cyclesOutputBlk, cmdCycles2);
 
     int64_t totalCycles = 0;
     if ( cyclesDpuBlk > cyclesOutputBlk )
@@ -251,9 +295,10 @@ int64_t EthosU85Performance::EstimateConvCycles(const PerformanceQuery &query, c
     else
     {
         totalCycles = int64_t(cyclesOutputBlk * numOfmBlks) + cyclesDpuBlk;
+        cmdCycles = cmdCycles * numOfmBlks + cyclesDpuBlk;
     }
 
-    return totalCycles;
+    return {totalCycles, cyclesDpu, cyclesAO, cmdCycles};
 }
 
 static int64_t EstimateMemoryTransfer(int cores, bool isRead, ArchitectureMemory *memory, TensorFormat format,
@@ -342,7 +387,7 @@ int64_t EthosU85Performance::EstimateMinimumMemoryCycles(const PerformanceQuery
 }
 
 
-float EthosU85Performance::EstimateOutputCyclesPerElement(const PerformanceQuery &query, const std::vector<FusionQuery> &fused)
+EthosU85ElementCycles EthosU85Performance::EstimateOutputCyclesPerElement(const PerformanceQuery &query, const std::vector<FusionQuery> &fused)
 {
     EthosU85OpConfig *opConfig = static_cast<EthosU85OpConfig *>(query.config);
     auto npuOp = _arch->GetHWOp(query.type);
@@ -379,16 +424,18 @@ float EthosU85Performance::EstimateOutputCyclesPerElement(const PerformanceQuery
     }
 
     float cyclesPerElement = std::max(_perfInfo->outputCycles[outputPerfIndex], _perfInfo->activationCycles[activationPerfIndex]);
-
+    float cycleCmd = 0;
+    float aoCyclesPerElement = cyclesPerElement;
     if ( npuOp == EthosU85NpuOp::Elementwise )
     {
         int numElemsBlk = opConfig->OfmBlock().Elements();
         assert(numElemsBlk > 0);
-        float cycleCmd = (float(EstimateMinimumMemoryCycles(query)) / float(numElemsBlk) + cyclesPerElement) / 4.0f;  // per DPU
+        cycleCmd = (float(EstimateMinimumMemoryCycles(query)) / float(numElemsBlk) + cyclesPerElement) / 4.0f;  // per
+                                                                                                                // DPU
         cyclesPerElement = std::max(cyclesPerElement, cycleCmd);
     }
 
-    return cyclesPerElement;
+    return {cyclesPerElement, aoCyclesPerElement, cycleCmd};
 }
 
 ElementAccess EthosU85Performance::MeasureElementAccess(const PerformanceQuery &query)
@@ -563,6 +610,12 @@ int64_t EthosU85Performance::WeightDecodeCycles(
         weightsPerCycle = weightsPerCore * _arch->_cores;
     }
     int64_t decodeCycles = weights.size / weightsPerCycle;
+    if ( _db && _nextId != -1 )
+    {
+        assert(_wdTable != -1);
+        _db->AddRow(_wdTable, _nextId, {std::to_string(decodeCycles)});
+        _nextId = -1;
+    }
 
     MemChannel channel = (format & WeightFormat::Fast) ? MemChannel::FastWeight : MemChannel::Weight;
     int64_t dmaCycles = int64_t(float(weights.encodedSize) / ChannelBW(weightsMemory, channel));
@@ -598,4 +651,38 @@ float EthosU85Performance::ChannelBW(const ArchitectureMemory *mem, const MemCha
     return channelBW;
 }
 
+void EthosU85Performance::InitDatabase(Database *optDB)  // Registers the U85 perf-debug tables and columns in optDB
+{
+    _db = optDB;  // dereferenced unconditionally below, so optDB must not be null
+    _mainTable = _db->AddTable("perf_debug_main");  // rows are added by MeasureCycleCost
+    _wdTable = _db->AddTable("perf_debug_wd");      // rows are added by WeightDecodeCycles
+
+    std::vector<std::string> columns = {  // order must match the row built in MeasureCycleCost
+        "mac_cycles",
+        "ao_cycles",
+        "cmd_cycles",
+        "traversal",
+    };
+
+    std::vector<std::string> shapes = {"ifm_block", "ofm_block", "ofm_ublock"};  // one _n/_h/_w/_c group per shape
+
+    for ( auto &shape : shapes )
+    {
+        columns.push_back(shape + "_n");
+        columns.push_back(shape + "_h");
+        columns.push_back(shape + "_w");
+        columns.push_back(shape + "_c");
+    }
+    _db->AddColumns(_mainTable, std::move(columns));
+    _db->AddColumns(_wdTable, {"wd_cycles"});  // receives decodeCycles from WeightDecodeCycles
+}
+
+void EthosU85Performance::RecordToDB(int opId)  // Arms recording: the next row written is keyed by opId
+{
+    if ( _db )  // ignored until InitDatabase has supplied a database
+    {
+        _nextId = opId;  // reset to -1 by whichever of MeasureCycleCost / WeightDecodeCycles writes a row first
+    }
+}
+
 }  // namespace regor
diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85_performance.hpp b/ethosu/regor/architecture/ethosu85/ethos_u85_performance.hpp
index 6c7faacd..a048e99e 100644
--- a/ethosu/regor/architecture/ethosu85/ethos_u85_performance.hpp
+++ b/ethosu/regor/architecture/ethosu85/ethos_u85_performance.hpp
@@ -33,6 +33,21 @@ struct EthosU85PerfInfo
     float activationCycles[3];
 };
 
+struct EthosU85Cycles  // Cycle breakdown returned by EstimateConvCycles; brace-initialised positionally, keep order
+{
+    int64_t cycles = 0;     // total cycles; MeasureCycleCost copies this into CycleCost::opCycles
+    int64_t macCycles = 0;  // written to the "mac_cycles" column (only for ops that use MACs)
+    int64_t aoCycles = 0;   // written to the "ao_cycles" column
+    int64_t cmdCycles = 0;  // written to the "cmd_cycles" column
+};
+
+struct EthosU85ElementCycles  // Per-element result of EstimateOutputCyclesPerElement; unpacked positionally
+{
+    float cycles;     // per-element cost; for Elementwise ops this is max(aoCycles, cmdCycles)
+    float aoCycles;   // max of the output and activation per-element costs, before the command bound
+    float cmdCycles;  // per-element command/memory bound; 0 unless the op is Elementwise
+};
+
 /// <summary>
 /// Profiles performance analysis for Ethos-U85
 /// </summary>
@@ -41,6 +56,10 @@ class EthosU85Performance : public ArchitecturePerformance
 protected:
     ArchEthosU85 *_arch;
     const EthosU85PerfInfo *_perfInfo;
+    Database *_db = nullptr;
+    int _nextId = -1;
+    int _mainTable = -1;
+    int _wdTable = -1;
 
 public:
     EthosU85Performance(ArchEthosU85 *arch, const EthosU85PerfInfo *perfInfo);
@@ -54,10 +73,12 @@ public:
     int64_t WeightDecodeCycles(const PerformanceQuery &query, const WeightStats &weights, Flags<WeightFormat> format,
         ArchitectureMemory *weightsMemory) override;
     float ChannelBW(const ArchitectureMemory *mem, MemChannel channel) override;
+    void InitDatabase(Database *optDB) override;
+    void RecordToDB(int opId) override;
 
 private:
-    int64_t EstimateConvCycles(const PerformanceQuery &query, const std::vector<FusionQuery> &fused);
-    float EstimateOutputCyclesPerElement(const PerformanceQuery &query, const std::vector<FusionQuery> &fused);
+    EthosU85Cycles EstimateConvCycles(const PerformanceQuery &query, const std::vector<FusionQuery> &fused);
+    EthosU85ElementCycles EstimateOutputCyclesPerElement(const PerformanceQuery &query, const std::vector<FusionQuery> &fused);
     int64_t EstimateMinimumMemoryCycles(const PerformanceQuery &query);
 };
 
diff --git a/ethosu/regor/compiler/network_performance.cpp b/ethosu/regor/compiler/network_performance.cpp
index 12818c13..b046c862 100644
--- a/ethosu/regor/compiler/network_performance.cpp
+++ b/ethosu/regor/compiler/network_performance.cpp
@@ -20,6 +20,7 @@
 
 #include "common/common.hpp"
 
+#include "compiler/shape_util.hpp"
 #include "database.hpp"
 #include "graph_optimiser.hpp"
 
@@ -50,13 +51,14 @@ PerformanceResult NetworkPerformance::Measure(Schedule *schedule, OptimiserDatab
         _arch->LUTMemory().memory, _arch->StagingMemory().memory});
     std::unordered_set<MemArea, MemArea::hash> regions(
         {_arch->ReadonlyMemory(), _arch->FeatureMapMemory(), _arch->LUTMemory(), _arch->StagingMemory()});
-    int opTable = 0;
-    int opTableColumnCount = 0;
     std::unordered_set<UniqueId> tensorUids;
+    int opTable = 0;
+    int perfDebugTable = 0;
 
     if ( optDb )
     {
         db = optDb->Get();
+        _arch->Performance()->InitDatabase(db);
         opTable = db->AddTable("perf");
         std::vector<std::string> columns = {
             "source_id",
@@ -74,7 +76,102 @@ PerformanceResult NetworkPerformance::Measure(Schedule *schedule, OptimiserDatab
             columns.push_back(label);
         }
         db->AddColumns(opTable, columns);
-        opTableColumnCount = int(columns.size());
+
+        perfDebugTable = db->AddTable("perf_debug");
+
+        columns = {};
+        const std::vector<std::string> shapeColumns = {
+            "ifm_shape",
+            "ifm2_shape",
+            "ofm_shape",
+            "ifm_slice",
+            "ifm2_slice",
+            "ofm_slice",
+            "ifm_stripe",
+            "ifm2_stripe",
+            "ofm_stripe",
+        };
+
+        for ( auto &shape : shapeColumns )
+        {
+            columns.push_back(shape + "_n");
+            columns.push_back(shape + "_h");
+            columns.push_back(shape + "_w");
+            columns.push_back(shape + "_c");
+        }
+
+        columns.insert(columns.end(),
+            {
+                "ifm_memory",
+                "ifm2_memory",
+                "ofm_memory",
+                "ifm_format",
+                "ifm2_format",
+                "ofm_format",
+                "ifm_dtype",
+                "ifm2_dtype",
+                "ofm_dtype",
+                "ifm_pre_buffering",
+                "ifm2_pre_buffering",
+                "ifm_buffering",
+                "ifm2_buffering",
+                "reverse_type",
+                "transpose_type",
+                "time_index",
+                "cascade",
+                "weight_format",
+                "weight_dtype",
+                "weight_total_bytes",
+                "weight_max_range_bytes",
+                "weight_sub_streams",
+                "weight_distinct",
+                "weight_zero",
+                "scales_dtype",
+                "scales_total_bytes",
+                "scales_max_range_bytes",
+                "ofm_depth_slices",
+                "weight_pre_buffer",
+                "weight_buffering",
+                "weight_transfer_cycles",
+                "kernel_depth_multiplier",
+            });
+
+        columns.emplace_back("kernel_padding_T");
+        columns.emplace_back("kernel_padding_B");
+        columns.emplace_back("kernel_padding_L");
+        columns.emplace_back("kernel_padding_R");
+        columns.emplace_back("kernel_padding_N");
+        columns.emplace_back("kernel_padding_F");
+
+        const std::vector<std::string> xyzColumns = {
+            "kernel_size",
+            "kernel_dilation",
+            "kernel_stride",
+        };
+        for ( auto &xyzCol : xyzColumns )
+        {
+            columns.push_back(xyzCol + "_x");
+            columns.push_back(xyzCol + "_y");
+            columns.push_back(xyzCol + "_z");
+        }
+
+        for ( const auto &mem : memories )
+        {
+            columns.push_back(mem->Name() + EnumToString(AccessType::Lut) + "_ac");
+            columns.push_back(mem->Name() + EnumToString(AccessType::Lut) + "_read");
+            columns.push_back(mem->Name() + EnumToString(AccessType::Lut) + "_write");
+            columns.push_back(mem->Name() + EnumToString(AccessType::FeatureMap) + "_ac");
+            columns.push_back(mem->Name() + EnumToString(AccessType::FeatureMap) + "_read");
+            columns.push_back(mem->Name() + EnumToString(AccessType::FeatureMap) + "_write");
+            columns.push_back(mem->Name() + EnumToString(AccessType::Weights) + "_ac");
+            columns.push_back(mem->Name() + EnumToString(AccessType::Weights) + "_read");
+            columns.push_back(mem->Name() + EnumToString(AccessType::Weights) + "_write");
+            columns.push_back(mem->Name() + EnumToString(AccessType::Scales) + "_ac");
+            columns.push_back(mem->Name() + EnumToString(AccessType::Scales) + "_read");
+            columns.push_back(mem->Name() + EnumToString(AccessType::Scales) + "_write");
+        }
+
+        db->AddColumns(perfDebugTable, std::move(columns));
     }
 
     for ( auto const &schedOp : _ops )
@@ -97,7 +194,7 @@ PerformanceResult NetworkPerformance::Measure(Schedule *schedule, OptimiserDatab
         }
         if ( optDb != nullptr )
         {
-            AddToDatabase(perf, schedOp.get(), opTable, opTableColumnCount, memories, optDb);
+            AddToDatabase(perf, schedOp.get(), cost, opTable, perfDebugTable, memories, optDb);
         }
         performance += perf;
         prevOp = schedOp.get();
@@ -109,7 +206,7 @@ PerformanceResult NetworkPerformance::Measure(Schedule *schedule, OptimiserDatab
             perf = ProcessOpPerformance(subOp.get(), cost, schedule, prevOp, prevCost, memories);
             if ( optDb != nullptr )
             {
-                AddToDatabase(perf, subOp.get(), opTable, opTableColumnCount, memories, optDb);
+                AddToDatabase(perf, subOp.get(), cost, opTable, perfDebugTable, memories, optDb);
             }
             performance += perf;
             prevOp = subOp.get();
@@ -155,8 +252,8 @@ PerformanceResult NetworkPerformance::ProcessOpPerformance(SchedulerOperation *s
 }
 
 
-void NetworkPerformance::AddToDatabase(const PerformanceResult &perf, SchedulerOperation *schedOp, int opTable,
-    int /*opTableColumnCount*/, const std::unordered_set<ArchitectureMemory *> &memories, OptimiserDatabase *optDb)
+void NetworkPerformance::AddToDatabase(const PerformanceResult &perf, SchedulerOperation *schedOp, SchedulerOpInfo *cost,
+    int opTable, int perfDebugTable, const std::unordered_set<ArchitectureMemory *> &memories, OptimiserDatabase *optDb)
 {
     // Per-layer calculations
     assert(optDb != nullptr);
@@ -188,7 +285,110 @@ void NetworkPerformance::AddToDatabase(const PerformanceResult &perf, SchedulerO
         row.push_back(std::to_string(perf.memory.at(mem).AccessCycles()));
     }
 
-    db->AddRow(opTable, schedOp->Index(), std::move(row));
+    db->AddRow(opTable, schedOp->Uid(), std::move(row));
+
+    row = {};  // 'row' was moved into the opTable row above; start a fresh row for perfDebugTable
+    auto shapeToStrings = [&row](const std::vector<int> &shape)  // appends one cell per axis; a 0 axis gives ""
+    {
+        std::transform(shape.begin(), shape.end(), std::back_inserter(row),
+            [](int n) -> std::string { return n ? std::to_string(n) : ""; });
+    };
+    // clang-format off
+    // Full feature map shapes (IFM, IFM2, OFM), normalised with ReshapeToNHWC; a missing IFM2 passes Shape()
+    shapeToStrings(ReshapeToNHWC(schedOp->IFM(0)->shape).ToList<int>());
+    shapeToStrings(ReshapeToNHWC(schedOp->TryIFM(1) ? schedOp->IFM(1)->shape : Shape()).ToList<int>());
+    shapeToStrings(ReshapeToNHWC(schedOp->OFM()->shape).ToList<int>());
+    // Slice shapes (IFM, IFM2, OFM)
+    shapeToStrings(ReshapeToNHWC(schedOp->IFM(0)->slice.shape).ToList<int>());
+    shapeToStrings(ReshapeToNHWC(schedOp->TryIFM(1) ? schedOp->IFM(1)->slice.shape : Shape()).ToList<int>());
+    shapeToStrings(ReshapeToNHWC(schedOp->OFM()->slice.shape).ToList<int>());
+    // Stripe shapes taken from the schedule cost (input stripes, then the output stripe)
+    shapeToStrings(ReshapeToNHWC(cost->stripeInput[0]).ToList<int>());
+    shapeToStrings(ReshapeToNHWC(schedOp->TryIFM(1) ? cost->stripeInput[1] : Shape()).ToList<int>());
+    shapeToStrings(ReshapeToNHWC(cost->stripe).ToList<int>());
+
+    row.insert(row.end(), {
+        // FM Memory
+        fmt::format("{}", schedOp->IFM(0)->tensor->memArea.memory->Name()),
+        fmt::format("{}", schedOp->TryIFM(1) ? schedOp->IFM(1)->tensor->memArea.memory->Name() : ""),
+        fmt::format("{}", schedOp->OFM()->tensor->memArea.memory->Name()),
+        // Formats
+        fmt::format("{}", EnumToString(schedOp->IFM(0)->tensor->format)),
+        fmt::format("{}", schedOp->TryIFM(1) ? EnumToString(schedOp->IFM(1)->tensor->format) : ""),
+        fmt::format("{}", EnumToString(schedOp->OFM()->tensor->format)),
+        // Data types
+        fmt::format("{}", EnumToString(schedOp->IFM(0)->tensor->dataType)),
+        fmt::format("{}", schedOp->TryIFM(1) ? EnumToString(schedOp->IFM(1)->tensor->dataType) : ""),
+        fmt::format("{}", EnumToString(schedOp->OFM()->tensor->dataType)),
+        // IFM Buffering
+        std::to_string(schedOp->IFM(0)->preBuffer),
+        schedOp->TryIFM(1) ? std::to_string(schedOp->IFM(1)->preBuffer) : "",
+        EnumToString(schedOp->IFM(0)->buffering),
+        schedOp->TryIFM(1) ? EnumToString(schedOp->IFM(1)->buffering) : "",
+        // Transpose and Reverse Types
+        EnumToString(schedOp->OFM()->transpose),
+        EnumToString(schedOp->OFM()->reverse),
+        // Timeindex
+        std::to_string(cost->timeIndex),
+        // Cascade
+        std::to_string(cost->cascade),
+        // Weights (all cells left empty when the op has no NPU weights tensor)
+        cost->npuWeightsTensor ? cost->npuWeightsTensor->config->Format().ToString() : "",
+        cost->npuWeightsTensor ? EnumToString(cost->npuWeightsTensor->dataType) : "",
+        cost->npuWeightsTensor ? std::to_string(cost->npuWeightsTensor->totalWeightBytes) : "",
+        cost->npuWeightsTensor ? std::to_string(cost->npuWeightsTensor->maxRangeBytes) : "",
+        cost->npuWeightsTensor ? std::to_string(cost->npuWeightsTensor->subStreams) : "",
+        cost->npuWeightsTensor ? std::to_string(cost->npuWeightsTensor->distinctWeights) : "",
+        cost->npuWeightsTensor ? std::to_string(cost->npuWeightsTensor->zeroCount) : "",
+        // Scales (all cells left empty when the op has no NPU scales tensor)
+        cost->npuScalesTensor ? EnumToString(cost->npuScalesTensor->dataType) : "",
+        cost->npuScalesTensor ? std::to_string(cost->npuScalesTensor->totalWeightBytes) : "",
+        cost->npuScalesTensor ? std::to_string(cost->npuScalesTensor->maxRangeBytes) : "",
+        // Weight Buffering (depth slices are joined with '|' into a single cell)
+        fmt::format("{}", fmt::join(cost->ofmDepthSlices, "|")),
+        cost->bufferedWeightTensor.tensor ? std::to_string(cost->bufferedWeightTensor.preBuffer) : "",
+        cost->bufferedWeightTensor.tensor ? EnumToString(cost->bufferedWeightTensor.buffering) : "",
+        cost->bufferedWeightTensor.tensor ? std::to_string(cost->fullWeightTransferCycles) : "",
+        // Kernel
+        std::to_string(schedOp->Kernel()->DepthMultiplier()),
+        std::to_string(schedOp->Kernel()->Padding().Top()),
+        std::to_string(schedOp->Kernel()->Padding().Bottom()),
+        std::to_string(schedOp->Kernel()->Padding().Left()),
+        std::to_string(schedOp->Kernel()->Padding().Right()),
+        std::to_string(schedOp->Kernel()->Padding().Near()),
+        std::to_string(schedOp->Kernel()->Padding().Far()),
+        std::to_string(schedOp->Kernel()->Size3D().x),
+        std::to_string(schedOp->Kernel()->Size3D().y),
+        std::to_string(schedOp->Kernel()->Size3D().z),
+        std::to_string(schedOp->Kernel()->Dilation3D().x),
+        std::to_string(schedOp->Kernel()->Dilation3D().y),
+        std::to_string(schedOp->Kernel()->Dilation3D().z),
+        std::to_string(schedOp->Kernel()->Stride3D().x),
+        std::to_string(schedOp->Kernel()->Stride3D().y),
+        std::to_string(schedOp->Kernel()->Stride3D().z),
+    });
+    // clang-format on
+    for ( const auto mem : memories )
+    {
+        // For every access type, add access cycles, bytes read and bytes written (same order as the columns):
+        const auto &memAccess = perf.memory.at(mem).access;  // looked up once per memory
+        for ( int i = 0; i < int(AccessType::Last); i++ )
+        {
+            const auto pos = memAccess.find(static_cast<AccessType>(i));
+            if ( pos != memAccess.end() )
+            {
+                row.push_back(std::to_string(pos->second.accessCycles));
+                row.push_back(std::to_string(pos->second.bytesRead));
+                row.push_back(std::to_string(pos->second.bytesWritten));
+            }
+            else
+            {
+                row.insert(row.end(), 3, std::string());  // no data for this access type: three empty cells
+            }
+        }
+    }
+
+    db->AddRow(perfDebugTable, schedOp->Uid(), std::move(row));
 }
 
 
@@ -200,8 +400,23 @@ PerformanceResult NetworkPerformance::EstimateFullOpPerformance(
     PerformanceQuery query = Scheduler::InitPerfQuery(schedOp, cost->Config(), -1, wgtFormat);
     std::vector<FusionQuery> fused = Scheduler::InitFusionQuery(schedOp);
 
+    // Memory that NPU will source weights from for operations
+    ArchitectureMemory *weightsMemory = cost->npuWeightsTensor ? cost->npuWeightsTensor->memArea.memory : nullptr;
+
+    _arch->Performance()->RecordToDB(schedOp->Uid());
     CycleCost cycles = _arch->Performance()->MeasureCycleCost(query, fused);
 
+    if ( cost->npuWeightsTensor )
+    {
+        WeightStats weightStats;
+        weightStats.size = cost->npuWeightsTensor->totalSourceBytes;
+        weightStats.encodedSize = cost->npuWeightsTensor->totalWeightBytes;
+        weightStats.zeroCount = cost->npuWeightsTensor->zeroCount;
+        weightStats.distinctWeights = cost->npuWeightsTensor->distinctWeights;
+        _arch->Performance()->RecordToDB(schedOp->Uid());
+        _arch->Performance()->WeightDecodeCycles(query, weightStats, query.weightFormat, weightsMemory);
+    }
+
     PerformanceResult result;
     result.npuCycles = cycles.opCycles;
     result.macCount = cycles.macs;
@@ -238,9 +453,6 @@ PerformanceResult NetworkPerformance::EstimateFullOpPerformance(
         }
     }
 
-    // Memory that NPU will source weights from for operations
-    ArchitectureMemory *weightsMemory = cost->npuWeightsTensor ? cost->npuWeightsTensor->memArea.memory : nullptr;
-
     if ( weightsMemory && cost->bufferedWeightTensor.tensor )
     {
         // DMA Weight Transfer
diff --git a/ethosu/regor/compiler/network_performance.hpp b/ethosu/regor/compiler/network_performance.hpp
index 6e0b942c..88f2be0b 100644
--- a/ethosu/regor/compiler/network_performance.hpp
+++ b/ethosu/regor/compiler/network_performance.hpp
@@ -32,12 +32,13 @@ namespace regor
 /// <summary>
 /// Performance information for a whole schedule
 /// </summary>
-enum AccessType
+enum class AccessType  // scoped enum: enumerators must be written as AccessType::X
 {
     Lut = 0,
     FeatureMap = 1,
     Weights = 2,
     Scales = 3,
+    Last,  // sentinel, one past Scales; used as the loop bound when iterating over all access types
 };
 
 struct PerformanceResult
@@ -146,8 +147,8 @@ private:
         SchedulerOperation *prevOp, SchedulerOpInfo *prevCost, const std::unordered_set<ArchitectureMemory *> &memories);
     PerformanceResult EstimateFullOpPerformance(
         SchedulerOperation *schedOp, SchedulerOpInfo *cost, SchedulerOperation *prevOp, SchedulerOpInfo *prevCost);
-    void AddToDatabase(const PerformanceResult &perf, SchedulerOperation *schedOp, int opTable, int columns,
-        const std::unordered_set<ArchitectureMemory *> &memories, OptimiserDatabase *optDb);
+    void AddToDatabase(const PerformanceResult &perf, SchedulerOperation *schedOp, SchedulerOpInfo *cost, int opTable,
+        int perfDebugTable, const std::unordered_set<ArchitectureMemory *> &memories, OptimiserDatabase *optDb);
 };
 
 
diff --git a/ethosu/regor/compiler/scheduler_operation.cpp b/ethosu/regor/compiler/scheduler_operation.cpp
new file mode 100644
index 00000000..c00b9eba
--- /dev/null
+++ b/ethosu/regor/compiler/scheduler_operation.cpp
@@ -0,0 +1,29 @@
+//
+// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the License); you may
+// not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an AS IS BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "compiler/scheduler_operation.hpp"
+
+#include "common/logging.hpp"
+
+#include "common/bit_flags.hpp"
+
+BEGIN_ENUM_TABLE(regor::Buffering)  // name table for regor::Buffering; presumably what EnumToString() reads - TODO confirm
+    ADD_ENUM_NAME(None)
+    ADD_ENUM_NAME(Single)
+    ADD_ENUM_NAME(Double)
+END_ENUM_TABLE()
diff --git a/ethosu/regor/compiler/shape_util.hpp b/ethosu/regor/compiler/shape_util.hpp
index 4a695147..5563c5fb 100644
--- a/ethosu/regor/compiler/shape_util.hpp
+++ b/ethosu/regor/compiler/shape_util.hpp
@@ -71,4 +71,17 @@ inline Shape ReshapeTo3DAroundEdges(const Shape &shape, int minAxis = 1)
     return ReshapeTo3D(shape, {1, shape.Size() - 2, 1}, minAxis);
 }
 
+inline Shape ReshapeToNHWC(Shape shape)  // collapses all axes ahead of the last three into axis 0
+{
+    if ( !shape.IsValid() )
+    {
+        shape = {0, 0, 0, 0};  // an invalid shape is replaced by an all-zero 4-axis shape
+    }
+    const int leadingProduct = shape.AxisProduct(0, shape.Size() - 3);  // taken before padding/extraction
+    Shape nhwc = Shape::PadAxes(shape, 4, 1).Extract(0, -3, -2, -1);
+    nhwc[0] = leadingProduct;
+    return nhwc;
+}
+
+
 }  // namespace regor
-- 
GitLab