From bcb9b7e82fde71574359e121e2f35bc01bc94de0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johan=20Alfv=C3=A9n?= <johan.alfven@arm.com>
Date: Tue, 15 Apr 2025 13:30:39 +0200
Subject: [PATCH] MLBEDSW-10632: [MLCE] Improve weight format selection using
 op perf
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Weight format selection now considers op cycle cost, making Sparse2_4
  more likely to be chosen when it yields real performance gains.

- Modified the part-kernel branch in EstimateConvCycles to halve the
  estimated cycle count when the weight format is Sparse2_4.

- Updated EstimateOpPerformance to include weight format.

- Removed EstimateOpPerformanceForSparsity, as it is no longer needed.

Signed-off-by: Johan Alfvén <johan.alfven@arm.com>
Change-Id: Iaa804567bd7f95895ae5ea9524c35479a3854aef
---
 ethosu/regor/architecture/architecture.hpp    |  1 -
 .../ethosu55/ethos_u55_performance.cpp        |  5 ---
 .../ethosu55/ethos_u55_performance.hpp        |  1 -
 .../ethosu85/ethos_u85_performance.cpp        | 10 +-----
 .../ethosu85/ethos_u85_performance.hpp        |  1 -
 ethosu/regor/compiler/scheduler.cpp           | 35 +++++++++----------
 ethosu/regor/compiler/scheduler.hpp           |  7 ++--
 7 files changed, 21 insertions(+), 39 deletions(-)
diff --git a/ethosu/regor/architecture/architecture.hpp b/ethosu/regor/architecture/architecture.hpp
index 02a6b7e2..39654377 100644
--- a/ethosu/regor/architecture/architecture.hpp
+++ b/ethosu/regor/architecture/architecture.hpp
@@ -305,7 +305,6 @@ class ArchitecturePerformance
 public:
     virtual ~ArchitecturePerformance() = default;
     virtual CycleCost MeasureCycleCost(const PerformanceQuery &query, const std::vector<FusionQuery> &fused) = 0;
-    virtual CycleCost MeasureCycleCostForSparsity(const PerformanceQuery &query, const std::vector<FusionQuery> &fused) = 0;
     virtual int64_t MemToMemCycles(const ArchitectureMemory *dest, const ArchitectureMemory *source, int sizeBytes) = 0;
     virtual ElementAccess MeasureElementAccess(const PerformanceQuery &query) = 0;
     virtual ElementAccess ElementTransferToBytes(const PerformanceQuery &query, const ElementAccess &access) = 0;
diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_performance.cpp b/ethosu/regor/architecture/ethosu55/ethos_u55_performance.cpp
index 26c83483..9605508a 100644
--- a/ethosu/regor/architecture/ethosu55/ethos_u55_performance.cpp
+++ b/ethosu/regor/architecture/ethosu55/ethos_u55_performance.cpp
@@ -49,11 +49,6 @@ EthosU55Performance::EthosU55Performance(ArchEthosU55 *arch, const EthosU55PerfI
     _perfInfo = perfInfo;
 }
 
-CycleCost EthosU55Performance::MeasureCycleCostForSparsity(const PerformanceQuery &query, const std::vector<FusionQuery> &fused)
-{
-    return MeasureCycleCost(query, fused);
-}
-
 CycleCost EthosU55Performance::MeasureCycleCost(const PerformanceQuery &query, const std::vector<FusionQuery> &fused)
 {
     CycleCost cycles;
diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_performance.hpp b/ethosu/regor/architecture/ethosu55/ethos_u55_performance.hpp
index 6d50c1d0..9eba2bbc 100644
--- a/ethosu/regor/architecture/ethosu55/ethos_u55_performance.hpp
+++ b/ethosu/regor/architecture/ethosu55/ethos_u55_performance.hpp
@@ -67,7 +67,6 @@ public:
 
 public:
     CycleCost MeasureCycleCost(const PerformanceQuery &query, const std::vector<FusionQuery> &fused) override;
-    CycleCost MeasureCycleCostForSparsity(const PerformanceQuery &query, const std::vector<FusionQuery> &fused) override;
     int64_t MemToMemCycles(const ArchitectureMemory *dest, const ArchitectureMemory *source, int sizeBytes) override;
     ElementAccess MeasureElementAccess(const PerformanceQuery &query) override;
     ElementAccess ElementTransferToBytes(const PerformanceQuery &query, const ElementAccess &access) override;
diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85_performance.cpp b/ethosu/regor/architecture/ethosu85/ethos_u85_performance.cpp
index c4f80795..c1007272 100644
--- a/ethosu/regor/architecture/ethosu85/ethos_u85_performance.cpp
+++ b/ethosu/regor/architecture/ethosu85/ethos_u85_performance.cpp
@@ -52,15 +52,6 @@ EthosU85Performance::EthosU85Performance(ArchEthosU85 *arch, const EthosU85PerfI
     _perfInfo = perfInfo;
 }
 
-CycleCost EthosU85Performance::MeasureCycleCostForSparsity(const PerformanceQuery &query, const std::vector<FusionQuery> &)
-{
-    // Temporary until we have a better performance estimate for U85
-    CycleCost cycles;
-    EthosU85OpConfig *opConfig = static_cast<EthosU85OpConfig *>(query.config);
-    cycles.opCycles = opConfig->OptimalDepthGranule();  // Maybe can find a better metric? MLBEDSW-9227
-    return cycles;
-}
-
 CycleCost EthosU85Performance::MeasureCycleCost(const PerformanceQuery &query, const std::vector<FusionQuery> &fused)
 {
     CycleCost cycles;
@@ -211,6 +202,7 @@ EthosU85Cycles EthosU85Performance::EstimateConvCycles(const PerformanceQuery &q
                 numKernelSteps = DivRoundUp(subKernelElements, divider);
                 cycles = std::max(cyclesWb, 4 * numUBlocks.ElementsWH()) * numKernelSteps * numUBlocks.Depth() *
                          DivRoundUp(ifmBlock.Depth(), 8);
+                cycles /= query.weightFormat & WeightFormat::Sparse2_4 ? 2 : 1;
             }
 
             // Calculate delay
diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85_performance.hpp b/ethosu/regor/architecture/ethosu85/ethos_u85_performance.hpp
index 72cdd42f..5831023a 100644
--- a/ethosu/regor/architecture/ethosu85/ethos_u85_performance.hpp
+++ b/ethosu/regor/architecture/ethosu85/ethos_u85_performance.hpp
@@ -67,7 +67,6 @@ public:
 
 public:
     CycleCost MeasureCycleCost(const PerformanceQuery &query, const std::vector<FusionQuery> &fused) override;
-    CycleCost MeasureCycleCostForSparsity(const PerformanceQuery &query, const std::vector<FusionQuery> &fused) override;
     int64_t MemToMemCycles(const ArchitectureMemory *dest, const ArchitectureMemory *source, int sizeBytes) override;
     ElementAccess MeasureElementAccess(const PerformanceQuery &query) override;
     ElementAccess ElementTransferToBytes(const PerformanceQuery &query, const ElementAccess &access) override;
diff --git a/ethosu/regor/compiler/scheduler.cpp b/ethosu/regor/compiler/scheduler.cpp
index 3a8722d3..114254d4 100644
--- a/ethosu/regor/compiler/scheduler.cpp
+++ b/ethosu/regor/compiler/scheduler.cpp
@@ -502,12 +502,14 @@ WeightScaleEncoding ChooseBestWeightFormat(Architecture *arch, SchedulerOperatio
             weightStats.zeroCount = weightTensor->zeroCount;
             weightStats.distinctWeights = weightTensor->distinctWeights;
             auto query = Scheduler::InitPerfQuery(op, nullptr);
-            auto cycles = arch->Performance()->WeightDecodeCycles(
-                query, weightStats, weightTensor->config->Format(), weightTensor->memArea.memory);
-            if ( cycles < minCycles )
+            auto totalCycles =
+                arch->Performance()->WeightDecodeCycles(
+                    query, weightStats, weightTensor->config->Format(), weightTensor->memArea.memory) +
+                encodingResult.cycleCost.opCycles;
+            if ( totalCycles < minCycles )
             {
                 bestResult = &encodingResult;
-                minCycles = cycles;
+                minCycles = totalCycles;
             }
         }
     }
@@ -563,11 +565,13 @@ WeightScaleEncoding Scheduler::EncodeBestWeightFormat(
         _arch, op, ifmShape, ifm2Shape, ofmShape, WF(WeightFormat::Default));
     std::unique_ptr<ArchitectureOpConfig> blockConfigSparse = MaybeGetSparsityConfig(_arch, op, ifmShape, ifm2Shape, ofmShape, supportedFormats);
 
+    CycleCost defaultCycleCost;
+    CycleCost sparseCycleCost;
     if ( blockConfigSparse )
     {
-        auto perfDefault = EstimateOpPerformanceForSparsity(op, blockConfigDefault.get(), op->OFM()->SliceShape().Depth());
-        auto perfSparse = EstimateOpPerformanceForSparsity(op, blockConfigSparse.get(), op->OFM()->SliceShape().Depth());
-        if ( perfSparse.opCycles > perfDefault.opCycles )
+        defaultCycleCost = EstimateOpPerformance(op, blockConfigDefault.get(), op->OFM()->SliceShape().Depth());
+        sparseCycleCost = EstimateOpPerformance(op, blockConfigSparse.get(), op->OFM()->SliceShape().Depth(), WeightFormat::Sparse2_4);
+        if ( sparseCycleCost.opCycles > defaultCycleCost.opCycles )
         {
             supportedFormats.Unset(WeightFormat::Sparse2_4);
         }
@@ -614,6 +618,8 @@ WeightScaleEncoding Scheduler::EncodeBestWeightFormat(
             {
                 supportedFormats.Unset(WeightFormat::Fast);
             }
+            // Sparse2_4 affects opCycles and must be accounted for when selecting the best weight format
+            encoding.cycleCost = (weightFormat % WeightFormat::Sparse2_4) ? sparseCycleCost : defaultCycleCost;
             encodingResults.emplace_back(std::move(encoding));
         }
         catch ( const WeightEncodeException & )
@@ -629,6 +635,7 @@ WeightScaleEncoding Scheduler::EncodeBestWeightFormat(
     auto bestEncoding = ChooseBestWeightFormat(_arch, op, _options.optimizationStrategy, encodingResults);
     bestEncoding.blockConfig =
         (bestEncoding.weightScales.npuWeightsTensor->config->Format() % WeightFormat::Sparse2_4) ? std::move(blockConfigSparse) : std::move(blockConfigDefault);
+
     return bestEncoding;
 }
 
@@ -1742,7 +1749,7 @@ std::vector<FusionQuery> Scheduler::InitFusionQuery(SchedulerOperation *op)
 }
 
 
-CycleCost Scheduler::EstimateOpPerformance(SchedulerOperation *op, ArchitectureOpConfig *config, int ofm_depth)
+CycleCost Scheduler::EstimateOpPerformance(SchedulerOperation *op, ArchitectureOpConfig *config, int ofm_depth, WeightFormat wgtFormat)
 {
     CycleCost cycleCost;
     if ( !op->IsNpuOp() )
@@ -1751,22 +1758,12 @@ CycleCost Scheduler::EstimateOpPerformance(SchedulerOperation *op, ArchitectureO
         return cycleCost;
     }
 
-    PerformanceQuery query = InitPerfQuery(op, config, ofm_depth);
+    PerformanceQuery query = InitPerfQuery(op, config, ofm_depth, wgtFormat);
     std::vector<FusionQuery> fused = InitFusionQuery(op);
     cycleCost = _arch->Performance()->MeasureCycleCost(query, fused);
     return cycleCost;
 }
 
-CycleCost Scheduler::EstimateOpPerformanceForSparsity(SchedulerOperation *op, ArchitectureOpConfig *config, int ofm_depth)
-{
-    CycleCost cycleCost;
-    assert(op->IsNpuOp());
-
-    PerformanceQuery query = InitPerfQuery(op, config, ofm_depth);
-    std::vector<FusionQuery> fused = InitFusionQuery(op);
-    cycleCost = _arch->Performance()->MeasureCycleCostForSparsity(query, fused);
-    return cycleCost;
-}
 
 ElementAccess Scheduler::EstimateOpElementAccess(SchedulerOperation *op, ArchitectureOpConfig *config, int ofm_depth)
 {
diff --git a/ethosu/regor/compiler/scheduler.hpp b/ethosu/regor/compiler/scheduler.hpp
index d327003d..eb32f60f 100644
--- a/ethosu/regor/compiler/scheduler.hpp
+++ b/ethosu/regor/compiler/scheduler.hpp
@@ -73,6 +73,8 @@ struct WeightScaleEncoding
 {
     std::unique_ptr<ArchitectureOpConfig> blockConfig;
     WeightScaleTensors weightScales;
+    // Keep track of op cycles - used in ChooseBestWeightFormat
+    CycleCost cycleCost;
 };
 
 struct SchedulerBufferTensor
@@ -349,9 +351,8 @@ private:
 
     void CoalesceWeightBufferTensors(Schedule *schedule);
 
-    CycleCost EstimateOpPerformance(SchedulerOperation *op, ArchitectureOpConfig *config, int ofm_depth);
-
-    CycleCost EstimateOpPerformanceForSparsity(SchedulerOperation *op, ArchitectureOpConfig *config, int ofm_depth);
+    CycleCost EstimateOpPerformance(SchedulerOperation *op, ArchitectureOpConfig *config, int ofm_depth,
+        WeightFormat wgtFormat = WeightFormat::Default);
 
     ElementAccess EstimateOpElementAccess(SchedulerOperation *op, ArchitectureOpConfig *config, int ofm_depth);
 
-- 
GitLab