From bcb9b7e82fde71574359e121e2f35bc01bc94de0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johan=20Alfv=C3=A9n?= Date: Tue, 15 Apr 2025 13:30:39 +0200 Subject: [PATCH] MLBEDSW-10632: [MLCE] Improve weight format selection using op perf MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Weight format selection now considers op cycle cost, making Sparse2_4 more likely to be chosen when it yields real performance gains. - Modified part kernel branch in EstimateConvCycles to account for Sparse2_4. - Updated EstimateOpPerformance to include weight format. - Removed EstimateOpPerformanceForSparsity, as it is no longer needed. Signed-off-by: Johan Alfvén Change-Id: Iaa804567bd7f95895ae5ea9524c35479a3854aef --- ethosu/regor/architecture/architecture.hpp | 1 - .../ethosu55/ethos_u55_performance.cpp | 5 --- .../ethosu55/ethos_u55_performance.hpp | 1 - .../ethosu85/ethos_u85_performance.cpp | 10 +----- .../ethosu85/ethos_u85_performance.hpp | 1 - ethosu/regor/compiler/scheduler.cpp | 35 +++++++++---------- ethosu/regor/compiler/scheduler.hpp | 7 ++-- 7 files changed, 21 insertions(+), 39 deletions(-) diff --git a/ethosu/regor/architecture/architecture.hpp b/ethosu/regor/architecture/architecture.hpp index 02a6b7e2..39654377 100644 --- a/ethosu/regor/architecture/architecture.hpp +++ b/ethosu/regor/architecture/architecture.hpp @@ -305,7 +305,6 @@ class ArchitecturePerformance public: virtual ~ArchitecturePerformance() = default; virtual CycleCost MeasureCycleCost(const PerformanceQuery &query, const std::vector &fused) = 0; - virtual CycleCost MeasureCycleCostForSparsity(const PerformanceQuery &query, const std::vector &fused) = 0; virtual int64_t MemToMemCycles(const ArchitectureMemory *dest, const ArchitectureMemory *source, int sizeBytes) = 0; virtual ElementAccess MeasureElementAccess(const PerformanceQuery &query) = 0; virtual ElementAccess ElementTransferToBytes(const PerformanceQuery &query, const ElementAccess &access) = 0; diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_performance.cpp b/ethosu/regor/architecture/ethosu55/ethos_u55_performance.cpp index 26c83483..9605508a 100644 --- a/ethosu/regor/architecture/ethosu55/ethos_u55_performance.cpp +++ b/ethosu/regor/architecture/ethosu55/ethos_u55_performance.cpp @@ -49,11 +49,6 @@ EthosU55Performance::EthosU55Performance(ArchEthosU55 *arch, const EthosU55PerfI _perfInfo = perfInfo; } -CycleCost EthosU55Performance::MeasureCycleCostForSparsity(const PerformanceQuery &query, const std::vector &fused) -{ - return MeasureCycleCost(query, fused); -} - CycleCost EthosU55Performance::MeasureCycleCost(const PerformanceQuery &query, const std::vector &fused) { CycleCost cycles; diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_performance.hpp b/ethosu/regor/architecture/ethosu55/ethos_u55_performance.hpp index 6d50c1d0..9eba2bbc 100644 --- a/ethosu/regor/architecture/ethosu55/ethos_u55_performance.hpp +++ b/ethosu/regor/architecture/ethosu55/ethos_u55_performance.hpp @@ -67,7 +67,6 @@ public: public: CycleCost MeasureCycleCost(const PerformanceQuery &query, const std::vector &fused) override; - CycleCost MeasureCycleCostForSparsity(const PerformanceQuery &query, const std::vector &fused) override; int64_t MemToMemCycles(const ArchitectureMemory *dest, const ArchitectureMemory *source, int sizeBytes) override; ElementAccess MeasureElementAccess(const PerformanceQuery &query) override; ElementAccess ElementTransferToBytes(const PerformanceQuery &query, const ElementAccess &access) override; diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85_performance.cpp b/ethosu/regor/architecture/ethosu85/ethos_u85_performance.cpp index c4f80795..c1007272 100644 --- a/ethosu/regor/architecture/ethosu85/ethos_u85_performance.cpp +++ b/ethosu/regor/architecture/ethosu85/ethos_u85_performance.cpp @@ -52,15 +52,6 @@ EthosU85Performance::EthosU85Performance(ArchEthosU85 *arch, const EthosU85PerfI _perfInfo = perfInfo; } -CycleCost EthosU85Performance::MeasureCycleCostForSparsity(const PerformanceQuery &query, const std::vector &) -{ - // Temporary until we have a better performance estimate for U85 - CycleCost cycles; - EthosU85OpConfig *opConfig = static_cast(query.config); - cycles.opCycles = opConfig->OptimalDepthGranule(); // Maybe can find a better metric? MLBEDSW-9227 - return cycles; -} - CycleCost EthosU85Performance::MeasureCycleCost(const PerformanceQuery &query, const std::vector &fused) { CycleCost cycles; @@ -211,6 +202,7 @@ EthosU85Cycles EthosU85Performance::EstimateConvCycles(const PerformanceQuery &q numKernelSteps = DivRoundUp(subKernelElements, divider); cycles = std::max(cyclesWb, 4 * numUBlocks.ElementsWH()) * numKernelSteps * numUBlocks.Depth() * DivRoundUp(ifmBlock.Depth(), 8); + cycles /= query.weightFormat & WeightFormat::Sparse2_4 ? 2 : 1; } // Calculate delay diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85_performance.hpp b/ethosu/regor/architecture/ethosu85/ethos_u85_performance.hpp index 72cdd42f..5831023a 100644 --- a/ethosu/regor/architecture/ethosu85/ethos_u85_performance.hpp +++ b/ethosu/regor/architecture/ethosu85/ethos_u85_performance.hpp @@ -67,7 +67,6 @@ public: public: CycleCost MeasureCycleCost(const PerformanceQuery &query, const std::vector &fused) override; - CycleCost MeasureCycleCostForSparsity(const PerformanceQuery &query, const std::vector &fused) override; int64_t MemToMemCycles(const ArchitectureMemory *dest, const ArchitectureMemory *source, int sizeBytes) override; ElementAccess MeasureElementAccess(const PerformanceQuery &query) override; ElementAccess ElementTransferToBytes(const PerformanceQuery &query, const ElementAccess &access) override; diff --git a/ethosu/regor/compiler/scheduler.cpp b/ethosu/regor/compiler/scheduler.cpp index 3a8722d3..114254d4 100644 --- a/ethosu/regor/compiler/scheduler.cpp +++ b/ethosu/regor/compiler/scheduler.cpp @@ -502,12 +502,14 @@ WeightScaleEncoding ChooseBestWeightFormat(Architecture *arch, SchedulerOperatio weightStats.zeroCount = weightTensor->zeroCount; weightStats.distinctWeights = weightTensor->distinctWeights; auto query = Scheduler::InitPerfQuery(op, nullptr); - auto cycles = arch->Performance()->WeightDecodeCycles( - query, weightStats, weightTensor->config->Format(), weightTensor->memArea.memory); - if ( cycles < minCycles ) + auto totalCycles = + arch->Performance()->WeightDecodeCycles( + query, weightStats, weightTensor->config->Format(), weightTensor->memArea.memory) + + encodingResult.cycleCost.opCycles; + if ( totalCycles < minCycles ) { bestResult = &encodingResult; - minCycles = cycles; + minCycles = totalCycles; } } } @@ -563,11 +565,13 @@ WeightScaleEncoding Scheduler::EncodeBestWeightFormat( _arch, op, ifmShape, ifm2Shape, ofmShape, WF(WeightFormat::Default)); std::unique_ptr blockConfigSparse = MaybeGetSparsityConfig(_arch, op, ifmShape, ifm2Shape, ofmShape, supportedFormats); + CycleCost defaultCycleCost; + CycleCost sparseCycleCost; if ( blockConfigSparse ) { - auto perfDefault = EstimateOpPerformanceForSparsity(op, blockConfigDefault.get(), op->OFM()->SliceShape().Depth()); - auto perfSparse = EstimateOpPerformanceForSparsity(op, blockConfigSparse.get(), op->OFM()->SliceShape().Depth()); - if ( perfSparse.opCycles > perfDefault.opCycles ) + defaultCycleCost = EstimateOpPerformance(op, blockConfigDefault.get(), op->OFM()->SliceShape().Depth()); + sparseCycleCost = EstimateOpPerformance(op, blockConfigSparse.get(), op->OFM()->SliceShape().Depth(), WeightFormat::Sparse2_4); + if ( sparseCycleCost.opCycles > defaultCycleCost.opCycles ) { supportedFormats.Unset(WeightFormat::Sparse2_4); } @@ -614,6 +618,8 @@ WeightScaleEncoding Scheduler::EncodeBestWeightFormat( { supportedFormats.Unset(WeightFormat::Fast); } + // Sparse2_4 affects opCycles and must be accounted for when selecting the best weight format + encoding.cycleCost = (weightFormat % WeightFormat::Sparse2_4) ? sparseCycleCost : defaultCycleCost; encodingResults.emplace_back(std::move(encoding)); } catch ( const WeightEncodeException & ) @@ -629,6 +635,7 @@ WeightScaleEncoding Scheduler::EncodeBestWeightFormat( auto bestEncoding = ChooseBestWeightFormat(_arch, op, _options.optimizationStrategy, encodingResults); bestEncoding.blockConfig = (bestEncoding.weightScales.npuWeightsTensor->config->Format() % WeightFormat::Sparse2_4) ? std::move(blockConfigSparse) : std::move(blockConfigDefault); + return bestEncoding; } @@ -1742,7 +1749,7 @@ std::vector Scheduler::InitFusionQuery(SchedulerOperation *op) } -CycleCost Scheduler::EstimateOpPerformance(SchedulerOperation *op, ArchitectureOpConfig *config, int ofm_depth) +CycleCost Scheduler::EstimateOpPerformance(SchedulerOperation *op, ArchitectureOpConfig *config, int ofm_depth, WeightFormat wgtFormat) { CycleCost cycleCost; if ( !op->IsNpuOp() ) @@ -1751,22 +1758,12 @@ CycleCost Scheduler::EstimateOpPerformance(SchedulerOperation *op, ArchitectureO return cycleCost; } - PerformanceQuery query = InitPerfQuery(op, config, ofm_depth); + PerformanceQuery query = InitPerfQuery(op, config, ofm_depth, wgtFormat); std::vector fused = InitFusionQuery(op); cycleCost = _arch->Performance()->MeasureCycleCost(query, fused); return cycleCost; } -CycleCost Scheduler::EstimateOpPerformanceForSparsity(SchedulerOperation *op, ArchitectureOpConfig *config, int ofm_depth) -{ - CycleCost cycleCost; - assert(op->IsNpuOp()); - - PerformanceQuery query = InitPerfQuery(op, config, ofm_depth); - std::vector fused = InitFusionQuery(op); - cycleCost = _arch->Performance()->MeasureCycleCostForSparsity(query, fused); - return cycleCost; -} ElementAccess Scheduler::EstimateOpElementAccess(SchedulerOperation *op, ArchitectureOpConfig *config, int ofm_depth) { diff --git a/ethosu/regor/compiler/scheduler.hpp b/ethosu/regor/compiler/scheduler.hpp index d327003d..eb32f60f 100644 --- a/ethosu/regor/compiler/scheduler.hpp +++ b/ethosu/regor/compiler/scheduler.hpp @@ -73,6 +73,8 @@ struct WeightScaleEncoding { std::unique_ptr blockConfig; WeightScaleTensors weightScales; + // Keep track of op cycles - used in ChooseBestWeightFormat + CycleCost cycleCost; }; struct SchedulerBufferTensor @@ -349,9 +351,8 @@ private: void CoalesceWeightBufferTensors(Schedule *schedule); - CycleCost EstimateOpPerformance(SchedulerOperation *op, ArchitectureOpConfig *config, int ofm_depth); - - CycleCost EstimateOpPerformanceForSparsity(SchedulerOperation *op, ArchitectureOpConfig *config, int ofm_depth); + CycleCost EstimateOpPerformance(SchedulerOperation *op, ArchitectureOpConfig *config, int ofm_depth, + WeightFormat wgtFormat = WeightFormat::Default); ElementAccess EstimateOpElementAccess(SchedulerOperation *op, ArchitectureOpConfig *config, int ofm_depth); -- GitLab