From 163f4e86f5b8cc9d7b1b6ee91907a13ff96fd2de Mon Sep 17 00:00:00 2001 From: William Isaksson Date: Thu, 19 Dec 2024 16:19:42 +0100 Subject: [PATCH] MLBEDSW-10091: Add more performance debug info to database Adds more performance debug info to the debug database. Change-Id: Ibaa6b40b8d0d8566af66746257e17257787b599c Signed-off-by: William Isaksson --- ethosu/regor/CMakeLists.txt | 1 + ethosu/regor/architecture/architecture.cpp | 7 + ethosu/regor/architecture/architecture.hpp | 3 + .../ethosu55/ethos_u55_performance.cpp | 121 +++++++-- .../ethosu55/ethos_u55_performance.hpp | 25 +- .../ethosu85/ethos_u85_performance.cpp | 113 ++++++++- .../ethosu85/ethos_u85_performance.hpp | 25 +- ethosu/regor/compiler/network_performance.cpp | 234 +++++++++++++++++- ethosu/regor/compiler/network_performance.hpp | 7 +- ethosu/regor/compiler/scheduler_operation.cpp | 29 +++ ethosu/regor/compiler/shape_util.hpp | 13 + 11 files changed, 531 insertions(+), 47 deletions(-) create mode 100644 ethosu/regor/compiler/scheduler_operation.cpp diff --git a/ethosu/regor/CMakeLists.txt b/ethosu/regor/CMakeLists.txt index 0810b5be..357ec732 100644 --- a/ethosu/regor/CMakeLists.txt +++ b/ethosu/regor/CMakeLists.txt @@ -284,6 +284,7 @@ regor_lib( "compiler/scheduler.cpp" "compiler/scheduler_decompose.cpp" "compiler/scheduler_packing.cpp" + "compiler/scheduler_operation.cpp" "compiler/softmax.cpp" "compiler/tensor.cpp" "compiler/tensor_allocator.cpp" diff --git a/ethosu/regor/architecture/architecture.cpp b/ethosu/regor/architecture/architecture.cpp index ae85546a..8ab2ca3c 100644 --- a/ethosu/regor/architecture/architecture.cpp +++ b/ethosu/regor/architecture/architecture.cpp @@ -30,6 +30,13 @@ BEGIN_ENUM_TABLE(regor::MemUsage) ADD_ENUM_NAME(Staging) END_ENUM_TABLE() +BEGIN_ENUM_TABLE(regor::TensorFormat) + ADD_ENUM_NAME(Unknown) + ADD_ENUM_NAME(NHWC) + ADD_ENUM_NAME(NHCWB16) + ADD_ENUM_NAME(WeightsEncoded) +END_ENUM_TABLE() + namespace regor { diff --git a/ethosu/regor/architecture/architecture.hpp 
b/ethosu/regor/architecture/architecture.hpp index 0805c029..1b6a8a1b 100644 --- a/ethosu/regor/architecture/architecture.hpp +++ b/ethosu/regor/architecture/architecture.hpp @@ -28,6 +28,7 @@ #include "common/scaling.hpp" #include "common/shape.hpp" #include "common/transpose_type.hpp" +#include "compiler/database.hpp" #include "compiler/kernel.hpp" #include "compiler/op_type.hpp" #include "compiler/tensor_properties.hpp" @@ -305,6 +306,8 @@ public: virtual int64_t WeightDecodeCycles(const PerformanceQuery &query, const WeightStats &weights, Flags format, ArchitectureMemory *weightsMemory) = 0; virtual float ChannelBW(const ArchitectureMemory *mem, MemChannel channel) = 0; + virtual void InitDatabase(Database *db) = 0; + virtual void RecordToDB(int opId) = 0; }; enum class IniParseResult diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_performance.cpp b/ethosu/regor/architecture/ethosu55/ethos_u55_performance.cpp index eaebcba3..be5a3b90 100644 --- a/ethosu/regor/architecture/ethosu55/ethos_u55_performance.cpp +++ b/ethosu/regor/architecture/ethosu55/ethos_u55_performance.cpp @@ -21,6 +21,7 @@ #include "common/common.hpp" #include "architecture/architecture.hpp" +#include "compiler/shape_util.hpp" #include "ethos_u55.hpp" namespace regor @@ -56,7 +57,10 @@ CycleCost EthosU55Performance::MeasureCycleCostForSparsity(const PerformanceQuer CycleCost EthosU55Performance::MeasureCycleCost(const PerformanceQuery &query, const std::vector &fused) { CycleCost cycles; + EthosU55Cycles cycleComponents = {}; auto npuOp = _arch->GetHWOp(query.type); + const bool recordToDb = _db && _nextId != -1; + // Convolution/Vector product cycle calculation if ( OpUsesMacs(npuOp) ) @@ -70,14 +74,21 @@ CycleCost EthosU55Performance::MeasureCycleCost(const PerformanceQuery &query, c cycles.macs = int64_t(query.kernel->ElementsWH()) * query.ofmShape.Elements() * query.ifmShape[0].Depth(); } - cycles.opCycles = EstimateConvCycles(query, fused); + cycleComponents = 
EstimateConvCycles(query, fused); + cycles.opCycles = cycleComponents.cycles; } // Elementwise cycle calculation else if ( npuOp == EthosU55NpuOp::Elementwise ) { + auto [totCCPerElem, aoCCPerElem, cmdCCPerElem] = EstimateOutputCyclesPerElement(query, fused); auto ofmShape = (query.ofmFormat == TensorFormat::NHCWB16) ? Shape::RoundAway(query.ofmShape, Shape(1, 1, 1, 16)) : query.ofmShape; - cycles.opCycles = int64_t(EstimateOutputCyclesPerElement(query, fused) * float(ofmShape.Elements())); + cycles.opCycles = int64_t(totCCPerElem * float(ofmShape.Elements())); + if ( recordToDb ) + { + cycleComponents.aoCycles = int64_t(aoCCPerElem * float(ofmShape.Elements())); + cycleComponents.cmdCycles = int64_t(cmdCCPerElem * float(ofmShape.Elements())); + } } else if ( npuOp == EthosU55NpuOp::Dma ) { @@ -95,6 +106,32 @@ CycleCost EthosU55Performance::MeasureCycleCost(const PerformanceQuery &query, c assert(false && "Unknown operator cycle costing"); } + if ( recordToDb ) + { + assert(_mainTable != -1); + EthosU55OpConfig *opConfig = static_cast(query.config); + + std::vector row = { + OpUsesMacs(npuOp) ? std::to_string(cycleComponents.macCycles) : "", + std::to_string(cycleComponents.aoCycles), + std::to_string(cycleComponents.cmdCycles), + opConfig ? EnumToString(opConfig->Traversal()) : "", + }; + + auto shapeToStrings = [&row](const std::vector &shape) + { + std::transform(shape.begin(), shape.end(), std::back_inserter(row), + [](int n) -> std::string { return n ? std::to_string(n) : ""; }); + }; + + shapeToStrings(ReshapeToNHWC(opConfig ? opConfig->IfmBlock() : Shape()).ToList()); + shapeToStrings(ReshapeToNHWC(opConfig ? 
opConfig->OfmBlock() : Shape()).ToList()); + + _db->AddRow(_mainTable, _nextId, std::move(row)); + _nextId = -1; + } + + return cycles; } @@ -107,7 +144,7 @@ int64_t EthosU55Performance::MemToMemCycles(const ArchitectureMemory *dest, cons return std::max(fromCycles, toCycles); } -int64_t EthosU55Performance::EstimateConvCycles(const PerformanceQuery &query, const std::vector &fused) +EthosU55Cycles EthosU55Performance::EstimateConvCycles(const PerformanceQuery &query, const std::vector &fused) { EthosU55OpConfig *opConfig = static_cast(query.config); auto npuOp = _arch->GetHWOp(query.type); @@ -237,7 +274,11 @@ int64_t EthosU55Performance::EstimateConvCycles(const PerformanceQuery &query, c // Estimate output cycles int numOfmBlks = Shape::DivRoundUp(query.ofmShape, ofmBlock).Elements(); - int64_t cyclesOutputBlk = int64_t(EstimateOutputCyclesPerElement(query, fused) * float(ofmBlock.Elements())); + auto [totCCPerElem, aoCCPerElem, cmdCCPerElem] = EstimateOutputCyclesPerElement(query, fused); + auto aoCycles = int64_t(aoCCPerElem * float(ofmBlock.Elements())); + auto cmdCycles = int64_t(cmdCCPerElem * float(ofmBlock.Elements())); + auto cyclesOutputBlk = int64_t(totCCPerElem * float(ofmBlock.Elements())); + // Scale and bias tensor if ( query.constShape.Size() > 0 && query.constShape.Depth() > 0 ) @@ -246,23 +287,28 @@ int64_t EthosU55Performance::EstimateConvCycles(const PerformanceQuery &query, c cyclesOutputBlk = std::max(cyclesOutputBlk, int64_t(cyclesBiasBlk)); } - int64_t cycles_cmd = EstimateMinimumMemoryCycles(query); - cycles_cmd = (cycles_cmd + cyclesOutputBlk + cyclesDpuBlk) / 4; // Per DPU + int64_t cmdCycles2 = EstimateMinimumMemoryCycles(query); + cmdCycles2 = (cmdCycles2 + cyclesOutputBlk + cyclesDpuBlk) / 4; // Per DPU + + int64_t cyclesAO = aoCycles * numOfmBlks + cyclesDpuBlk; + int64_t cyclesDpu = cyclesDpuBlk * numOfmBlks + cyclesOutputBlk; - cyclesDpuBlk = std::max(cyclesDpuBlk, cycles_cmd); - cyclesOutputBlk = std::max(cyclesOutputBlk, 
cycles_cmd); + cmdCycles = std::max(cmdCycles, cmdCycles2); + cyclesDpuBlk = std::max(cyclesDpuBlk, cmdCycles2); + cyclesOutputBlk = std::max(cyclesOutputBlk, cmdCycles2); int64_t totalCycles = 0; if ( cyclesDpuBlk > cyclesOutputBlk ) { - totalCycles = cyclesDpuBlk * numOfmBlks + cyclesOutputBlk; + totalCycles = int64_t(cyclesDpuBlk * numOfmBlks) + cyclesOutputBlk; } else { - totalCycles = cyclesOutputBlk * numOfmBlks + cyclesDpuBlk; + totalCycles = int64_t(cyclesOutputBlk * numOfmBlks) + cyclesDpuBlk; + cmdCycles = cmdCycles * numOfmBlks + cyclesDpuBlk; } - return totalCycles; + return {totalCycles, cyclesDpu, cyclesAO, cmdCycles}; } static int EstimateMemoryTransfer(int cores, bool isRead, ArchitectureMemory *memory, TensorFormat format, @@ -350,7 +396,7 @@ int64_t EthosU55Performance::EstimateMinimumMemoryCycles(const PerformanceQuery } -float EthosU55Performance::EstimateOutputCyclesPerElement(const PerformanceQuery &query, const std::vector &fused) +EthosU55ElementCycles EthosU55Performance::EstimateOutputCyclesPerElement(const PerformanceQuery &query, const std::vector &fused) { EthosU55OpConfig *opConfig = static_cast(query.config); auto npuOp = _arch->GetHWOp(query.type); @@ -413,16 +459,18 @@ float EthosU55Performance::EstimateOutputCyclesPerElement(const PerformanceQuery } float cyclesPerElement = std::max(_perfInfo->outputCycles[outputPerfIndex], _perfInfo->activationCycles[activationPerfIndex]); - + float cycleCmd = 0; + float aoCyclesPerElement = cyclesPerElement; if ( npuOp == EthosU55NpuOp::Elementwise ) { int numElemsBlk = opConfig->OfmBlock().Elements(); assert(numElemsBlk > 0); - float cycleCmd = (float(EstimateMinimumMemoryCycles(query)) / float(numElemsBlk) + cyclesPerElement) / 4.0f; // per DPU + cycleCmd = (float(EstimateMinimumMemoryCycles(query)) / float(numElemsBlk) + cyclesPerElement) / 4.0f; // per + // DPU cyclesPerElement = std::max(cyclesPerElement, cycleCmd); } - return cyclesPerElement; + return {cyclesPerElement, aoCyclesPerElement, 
cycleCmd}; } ElementAccess EthosU55Performance::MeasureElementAccess(const PerformanceQuery &query) @@ -558,8 +606,14 @@ ElementAccess EthosU55Performance::ElementTransferToBytes(const PerformanceQuery } int64_t EthosU55Performance::WeightDecodeCycles( - const PerformanceQuery &, const WeightStats &weights, Flags, ArchitectureMemory *weightsMemory) + const PerformanceQuery &, const WeightStats &weights, Flags format, ArchitectureMemory *weightsMemory) { + if ( _db && _nextId != -1 ) + { + assert(_wdTable != -1); + _db->AddRow(_wdTable, _nextId, {""}); + _nextId = -1; + } int64_t dmaCycles = int64_t(float(weights.encodedSize) / weightsMemory->Bandwidth()); dmaCycles += weightsMemory->ReadLatency(); return dmaCycles; @@ -570,4 +624,39 @@ float EthosU55Performance::ChannelBW(const ArchitectureMemory *mem, const MemCha UNUSED(channel); return mem->Bandwidth(); } + +void EthosU55Performance::InitDatabase(Database *optDB) +{ + _db = optDB; + _mainTable = _db->AddTable("perf_debug_main"); + _wdTable = _db->AddTable("perf_debug_wd"); + + std::vector columns = { + "mac_cycles", + "ao_cycles", + "cmd_cycles", + "traversal", + }; + + std::vector shapes = {"ifm_block", "ofm_block"}; + + for ( auto &shape : shapes ) + { + columns.push_back(shape + "_n"); + columns.push_back(shape + "_h"); + columns.push_back(shape + "_w"); + columns.push_back(shape + "_c"); + } + _db->AddColumns(_mainTable, std::move(columns)); + _db->AddColumns(_wdTable, {"wd_cycles"}); +} + +void EthosU55Performance::RecordToDB(int opId) +{ + if ( _db ) + { + _nextId = opId; + } +} + } // namespace regor diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_performance.hpp b/ethosu/regor/architecture/ethosu55/ethos_u55_performance.hpp index e9839b07..c3b8d2a5 100644 --- a/ethosu/regor/architecture/ethosu55/ethos_u55_performance.hpp +++ b/ethosu/regor/architecture/ethosu55/ethos_u55_performance.hpp @@ -33,6 +33,21 @@ struct EthosU55PerfInfo float activationCycles[3]; }; +struct EthosU55Cycles +{ + int64_t 
cycles; + int64_t macCycles; + int64_t aoCycles; + int64_t cmdCycles; +}; + +struct EthosU55ElementCycles +{ + float cycles; + float aoCycles; + float cmdCycles; +}; + /// /// Profiles performance analysis for Ethos-U55 /// @@ -41,6 +56,10 @@ class EthosU55Performance : public ArchitecturePerformance protected: ArchEthosU55 *_arch; const EthosU55PerfInfo *_perfInfo; + Database *_db = nullptr; + int _nextId = -1; + int _mainTable = -1; + int _wdTable = -1; public: EthosU55Performance(ArchEthosU55 *arch, const EthosU55PerfInfo *perfInfo); @@ -54,10 +73,12 @@ public: int64_t WeightDecodeCycles(const PerformanceQuery &query, const WeightStats &weights, Flags format, ArchitectureMemory *weightsMemory) override; float ChannelBW(const ArchitectureMemory *mem, MemChannel channel) override; + void InitDatabase(Database *optDB) override; + void RecordToDB(int opId) override; private: - int64_t EstimateConvCycles(const PerformanceQuery &query, const std::vector &fused); - float EstimateOutputCyclesPerElement(const PerformanceQuery &query, const std::vector &fused); + EthosU55Cycles EstimateConvCycles(const PerformanceQuery &query, const std::vector &fused); + EthosU55ElementCycles EstimateOutputCyclesPerElement(const PerformanceQuery &query, const std::vector &fused); int64_t EstimateMinimumMemoryCycles(const PerformanceQuery &query); }; diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85_performance.cpp b/ethosu/regor/architecture/ethosu85/ethos_u85_performance.cpp index db4aa585..a6785cf1 100644 --- a/ethosu/regor/architecture/ethosu85/ethos_u85_performance.cpp +++ b/ethosu/regor/architecture/ethosu85/ethos_u85_performance.cpp @@ -22,6 +22,7 @@ #include "common/logging.hpp" #include "architecture/architecture.hpp" +#include "compiler/shape_util.hpp" #include "ethos_u85.hpp" namespace regor @@ -63,8 +64,11 @@ CycleCost EthosU85Performance::MeasureCycleCostForSparsity(const PerformanceQuer CycleCost EthosU85Performance::MeasureCycleCost(const PerformanceQuery &query, 
const std::vector &fused) { CycleCost cycles; + EthosU85Cycles cycleComponents; + auto npuOp = _arch->GetHWOp(query.type); const bool sparse = query.weightFormat & WeightFormat::Sparse2_4; + const bool recordToDb = _db && _nextId != -1; // Convolution/Vector product cycle calculation if ( OpUsesMacs(npuOp) ) { @@ -79,14 +83,21 @@ CycleCost EthosU85Performance::MeasureCycleCost(const PerformanceQuery &query, c } cycles.macs /= sparse ? 2 : 1; - cycles.opCycles = EstimateConvCycles(query, fused); + cycleComponents = EstimateConvCycles(query, fused); + cycles.opCycles = cycleComponents.cycles; } // Elementwise cycle calculation else if ( npuOp == EthosU85NpuOp::Elementwise ) { + auto [totCCPerElem, aoCCPerElem, cmdCCPerElem] = EstimateOutputCyclesPerElement(query, fused); auto ofmShape = (query.ofmFormat == TensorFormat::NHCWB16) ? Shape::RoundAway(query.ofmShape, Shape(1, 1, 1, 16)) : query.ofmShape; - cycles.opCycles = int64_t(EstimateOutputCyclesPerElement(query, fused) * float(ofmShape.Elements())); + cycles.opCycles = int64_t(totCCPerElem * float(ofmShape.Elements())); + if ( recordToDb ) + { + cycleComponents.aoCycles = int64_t(aoCCPerElem * float(ofmShape.Elements())); + cycleComponents.cmdCycles = int64_t(cmdCCPerElem * float(ofmShape.Elements())); + } } // Resize cycle calculation else if ( npuOp == EthosU85NpuOp::Resize ) @@ -108,6 +119,32 @@ CycleCost EthosU85Performance::MeasureCycleCost(const PerformanceQuery &query, c assert(false && "Unknown operator cycle costing"); } + if ( recordToDb ) + { + assert(_mainTable != -1); + EthosU85OpConfig *opConfig = static_cast(query.config); + + std::vector row = { + OpUsesMacs(npuOp) ? std::to_string(cycleComponents.macCycles) : "", + std::to_string(cycleComponents.aoCycles), + std::to_string(cycleComponents.cmdCycles), + opConfig ? 
EnumToString(opConfig->Traversal()) : "", + }; + auto shapeToStrings = [&row](const std::vector &shape) + { + std::transform(shape.begin(), shape.end(), std::back_inserter(row), + [](int n) -> std::string { return n ? std::to_string(n) : ""; }); + }; + + + shapeToStrings(ReshapeToNHWC(opConfig ? opConfig->IfmBlock() : Shape()).ToList()); + shapeToStrings(ReshapeToNHWC(opConfig ? opConfig->OfmBlock() : Shape()).ToList()); + shapeToStrings(ReshapeToNHWC(opConfig ? opConfig->OfmUBlock() : Shape()).ToList()); + + _db->AddRow(_mainTable, _nextId, std::move(row)); + _nextId = -1; + } + return cycles; } @@ -121,7 +158,7 @@ int64_t EthosU85Performance::MemToMemCycles(const ArchitectureMemory *dest, cons return std::max(fromCycles, toCycles); } -int64_t EthosU85Performance::EstimateConvCycles(const PerformanceQuery &query, const std::vector &fused) +EthosU85Cycles EthosU85Performance::EstimateConvCycles(const PerformanceQuery &query, const std::vector &fused) { EthosU85OpConfig *opConfig = static_cast(query.config); auto npuOp = _arch->GetHWOp(query.type); @@ -228,7 +265,10 @@ int64_t EthosU85Performance::EstimateConvCycles(const PerformanceQuery &query, c { numOfmBlks *= std::max(static_cast(query.ofmShape[i]) / ofmBlock[i], 1.0f); } - int64_t cyclesOutputBlk = int64_t(EstimateOutputCyclesPerElement(query, fused) * float(ofmBlock.Elements())); + auto [totCCPerElem, aoCCPerElem, cmdCCPerElem] = EstimateOutputCyclesPerElement(query, fused); + auto aoCycles = int64_t(aoCCPerElem * float(ofmBlock.Elements())); + auto cmdCycles = int64_t(cmdCCPerElem * float(ofmBlock.Elements())); + auto cyclesOutputBlk = int64_t(totCCPerElem * float(ofmBlock.Elements())); // Scale and bias tensor if ( query.constShape.Size() > 0 && query.constShape.Depth() > 0 ) @@ -237,11 +277,15 @@ int64_t EthosU85Performance::EstimateConvCycles(const PerformanceQuery &query, c cyclesOutputBlk = std::max(cyclesOutputBlk, int64_t(cyclesBiasBlk)); } - int64_t cycles_cmd = EstimateMinimumMemoryCycles(query); - 
cycles_cmd = (cycles_cmd + cyclesOutputBlk + cyclesDpuBlk) / 4; // Per DPU + int64_t cmdCycles2 = EstimateMinimumMemoryCycles(query); + cmdCycles2 = (cmdCycles2 + cyclesOutputBlk + cyclesDpuBlk) / 4; // Per DPU + + int64_t cyclesAO = aoCycles * numOfmBlks + cyclesDpuBlk; + int64_t cyclesDpu = cyclesDpuBlk * numOfmBlks + cyclesOutputBlk; - cyclesDpuBlk = std::max(cyclesDpuBlk, cycles_cmd); - cyclesOutputBlk = std::max(cyclesOutputBlk, cycles_cmd); + cmdCycles = std::max(cmdCycles, cmdCycles2); + cyclesDpuBlk = std::max(cyclesDpuBlk, cmdCycles2); + cyclesOutputBlk = std::max(cyclesOutputBlk, cmdCycles2); int64_t totalCycles = 0; if ( cyclesDpuBlk > cyclesOutputBlk ) @@ -251,9 +295,10 @@ int64_t EthosU85Performance::EstimateConvCycles(const PerformanceQuery &query, c else { totalCycles = int64_t(cyclesOutputBlk * numOfmBlks) + cyclesDpuBlk; + cmdCycles = cmdCycles * numOfmBlks + cyclesDpuBlk; } - return totalCycles; + return {totalCycles, cyclesDpu, cyclesAO, cmdCycles}; } static int64_t EstimateMemoryTransfer(int cores, bool isRead, ArchitectureMemory *memory, TensorFormat format, @@ -342,7 +387,7 @@ int64_t EthosU85Performance::EstimateMinimumMemoryCycles(const PerformanceQuery } -float EthosU85Performance::EstimateOutputCyclesPerElement(const PerformanceQuery &query, const std::vector &fused) +EthosU85ElementCycles EthosU85Performance::EstimateOutputCyclesPerElement(const PerformanceQuery &query, const std::vector &fused) { EthosU85OpConfig *opConfig = static_cast(query.config); auto npuOp = _arch->GetHWOp(query.type); @@ -379,16 +424,18 @@ float EthosU85Performance::EstimateOutputCyclesPerElement(const PerformanceQuery } float cyclesPerElement = std::max(_perfInfo->outputCycles[outputPerfIndex], _perfInfo->activationCycles[activationPerfIndex]); - + float cycleCmd = 0; + float aoCyclesPerElement = cyclesPerElement; if ( npuOp == EthosU85NpuOp::Elementwise ) { int numElemsBlk = opConfig->OfmBlock().Elements(); assert(numElemsBlk > 0); - float cycleCmd = 
(float(EstimateMinimumMemoryCycles(query)) / float(numElemsBlk) + cyclesPerElement) / 4.0f; // per DPU + cycleCmd = (float(EstimateMinimumMemoryCycles(query)) / float(numElemsBlk) + cyclesPerElement) / 4.0f; // per + // DPU cyclesPerElement = std::max(cyclesPerElement, cycleCmd); } - return cyclesPerElement; + return {cyclesPerElement, aoCyclesPerElement, cycleCmd}; } ElementAccess EthosU85Performance::MeasureElementAccess(const PerformanceQuery &query) @@ -563,6 +610,12 @@ int64_t EthosU85Performance::WeightDecodeCycles( weightsPerCycle = weightsPerCore * _arch->_cores; } int64_t decodeCycles = weights.size / weightsPerCycle; + if ( _db && _nextId != -1 ) + { + assert(_wdTable != -1); + _db->AddRow(_wdTable, _nextId, {std::to_string(decodeCycles)}); + _nextId = -1; + } MemChannel channel = (format & WeightFormat::Fast) ? MemChannel::FastWeight : MemChannel::Weight; int64_t dmaCycles = int64_t(float(weights.encodedSize) / ChannelBW(weightsMemory, channel)); @@ -598,4 +651,38 @@ float EthosU85Performance::ChannelBW(const ArchitectureMemory *mem, const MemCha return channelBW; } +void EthosU85Performance::InitDatabase(Database *optDB) +{ + _db = optDB; + _mainTable = _db->AddTable("perf_debug_main"); + _wdTable = _db->AddTable("perf_debug_wd"); + + std::vector columns = { + "mac_cycles", + "ao_cycles", + "cmd_cycles", + "traversal", + }; + + std::vector shapes = {"ifm_block", "ofm_block", "ofm_ublock"}; + + for ( auto &shape : shapes ) + { + columns.push_back(shape + "_n"); + columns.push_back(shape + "_h"); + columns.push_back(shape + "_w"); + columns.push_back(shape + "_c"); + } + _db->AddColumns(_mainTable, std::move(columns)); + _db->AddColumns(_wdTable, {"wd_cycles"}); +} + +void EthosU85Performance::RecordToDB(int opId) +{ + if ( _db ) + { + _nextId = opId; + } +} + } // namespace regor diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85_performance.hpp b/ethosu/regor/architecture/ethosu85/ethos_u85_performance.hpp index 6c7faacd..a048e99e 100644 --- 
a/ethosu/regor/architecture/ethosu85/ethos_u85_performance.hpp +++ b/ethosu/regor/architecture/ethosu85/ethos_u85_performance.hpp @@ -33,6 +33,21 @@ struct EthosU85PerfInfo float activationCycles[3]; }; +struct EthosU85Cycles +{ + int64_t cycles = 0; + int64_t macCycles = 0; + int64_t aoCycles = 0; + int64_t cmdCycles = 0; +}; + +struct EthosU85ElementCycles +{ + float cycles; + float aoCycles; + float cmdCycles; +}; + /// /// Profiles performance analysis for Ethos-U85 /// @@ -41,6 +56,10 @@ class EthosU85Performance : public ArchitecturePerformance protected: ArchEthosU85 *_arch; const EthosU85PerfInfo *_perfInfo; + Database *_db = nullptr; + int _nextId = -1; + int _mainTable = -1; + int _wdTable = -1; public: EthosU85Performance(ArchEthosU85 *arch, const EthosU85PerfInfo *perfInfo); @@ -54,10 +73,12 @@ public: int64_t WeightDecodeCycles(const PerformanceQuery &query, const WeightStats &weights, Flags format, ArchitectureMemory *weightsMemory) override; float ChannelBW(const ArchitectureMemory *mem, MemChannel channel) override; + void InitDatabase(Database *optDB) override; + void RecordToDB(int opId) override; private: - int64_t EstimateConvCycles(const PerformanceQuery &query, const std::vector &fused); - float EstimateOutputCyclesPerElement(const PerformanceQuery &query, const std::vector &fused); + EthosU85Cycles EstimateConvCycles(const PerformanceQuery &query, const std::vector &fused); + EthosU85ElementCycles EstimateOutputCyclesPerElement(const PerformanceQuery &query, const std::vector &fused); int64_t EstimateMinimumMemoryCycles(const PerformanceQuery &query); }; diff --git a/ethosu/regor/compiler/network_performance.cpp b/ethosu/regor/compiler/network_performance.cpp index 12818c13..b046c862 100644 --- a/ethosu/regor/compiler/network_performance.cpp +++ b/ethosu/regor/compiler/network_performance.cpp @@ -20,6 +20,7 @@ #include "common/common.hpp" +#include "compiler/shape_util.hpp" #include "database.hpp" #include "graph_optimiser.hpp" @@ -50,13 
+51,14 @@ PerformanceResult NetworkPerformance::Measure(Schedule *schedule, OptimiserDatab _arch->LUTMemory().memory, _arch->StagingMemory().memory}); std::unordered_set regions( {_arch->ReadonlyMemory(), _arch->FeatureMapMemory(), _arch->LUTMemory(), _arch->StagingMemory()}); - int opTable = 0; - int opTableColumnCount = 0; std::unordered_set tensorUids; + int opTable = 0; + int perfDebugTable = 0; if ( optDb ) { db = optDb->Get(); + _arch->Performance()->InitDatabase(db); opTable = db->AddTable("perf"); std::vector columns = { "source_id", @@ -74,7 +76,102 @@ PerformanceResult NetworkPerformance::Measure(Schedule *schedule, OptimiserDatab columns.push_back(label); } db->AddColumns(opTable, columns); - opTableColumnCount = int(columns.size()); + + perfDebugTable = db->AddTable("perf_debug"); + + columns = {}; + const std::vector shapeColumns = { + "ifm_shape", + "ifm2_shape", + "ofm_shape", + "ifm_slice", + "ifm2_slice", + "ofm_slice", + "ifm_stripe", + "ifm2_stripe", + "ofm_stripe", + }; + + for ( auto &shape : shapeColumns ) + { + columns.push_back(shape + "_n"); + columns.push_back(shape + "_h"); + columns.push_back(shape + "_w"); + columns.push_back(shape + "_c"); + } + + columns.insert(columns.end(), + { + "ifm_memory", + "ifm2_memory", + "ofm_memory", + "ifm_format", + "ifm2_format", + "ofm_format", + "ifm_dtype", + "ifm2_dtype", + "ofm_dtype", + "ifm_pre_buffering", + "ifm2_pre_buffering", + "ifm_buffering", + "ifm2_buffering", + "reverse_type", + "transpose_type", + "time_index", + "cascade", + "weight_format", + "weight_dtype", + "weight_total_bytes", + "weight_max_range_bytes", + "weight_sub_streams", + "weight_distinct", + "weight_zero", + "scales_dtype", + "scales_total_bytes", + "scales_max_range_bytes", + "ofm_depth_slices", + "weight_pre_buffer", + "weight_buffering", + "weight_transfer_cycles", + "kernel_depth_multiplier", + }); + + columns.emplace_back("kernel_padding_T"); + columns.emplace_back("kernel_padding_B"); + 
columns.emplace_back("kernel_padding_L"); + columns.emplace_back("kernel_padding_R"); + columns.emplace_back("kernel_padding_N"); + columns.emplace_back("kernel_padding_F"); + + const std::vector xyzColumns = { + "kernel_size", + "kernel_dilation", + "kernel_stride", + }; + for ( auto &xyzCol : xyzColumns ) + { + columns.push_back(xyzCol + "_x"); + columns.push_back(xyzCol + "_y"); + columns.push_back(xyzCol + "_z"); + } + + for ( const auto &mem : memories ) + { + columns.push_back(mem->Name() + EnumToString(AccessType::Lut) + "_ac"); + columns.push_back(mem->Name() + EnumToString(AccessType::Lut) + "_read"); + columns.push_back(mem->Name() + EnumToString(AccessType::Lut) + "_write"); + columns.push_back(mem->Name() + EnumToString(AccessType::FeatureMap) + "_ac"); + columns.push_back(mem->Name() + EnumToString(AccessType::FeatureMap) + "_read"); + columns.push_back(mem->Name() + EnumToString(AccessType::FeatureMap) + "_write"); + columns.push_back(mem->Name() + EnumToString(AccessType::Weights) + "_ac"); + columns.push_back(mem->Name() + EnumToString(AccessType::Weights) + "_read"); + columns.push_back(mem->Name() + EnumToString(AccessType::Weights) + "_write"); + columns.push_back(mem->Name() + EnumToString(AccessType::Scales) + "_ac"); + columns.push_back(mem->Name() + EnumToString(AccessType::Scales) + "_read"); + columns.push_back(mem->Name() + EnumToString(AccessType::Scales) + "_write"); + } + + db->AddColumns(perfDebugTable, std::move(columns)); } for ( auto const &schedOp : _ops ) @@ -97,7 +194,7 @@ PerformanceResult NetworkPerformance::Measure(Schedule *schedule, OptimiserDatab } if ( optDb != nullptr ) { - AddToDatabase(perf, schedOp.get(), opTable, opTableColumnCount, memories, optDb); + AddToDatabase(perf, schedOp.get(), cost, opTable, perfDebugTable, memories, optDb); } performance += perf; prevOp = schedOp.get(); @@ -109,7 +206,7 @@ PerformanceResult NetworkPerformance::Measure(Schedule *schedule, OptimiserDatab perf = 
ProcessOpPerformance(subOp.get(), cost, schedule, prevOp, prevCost, memories); if ( optDb != nullptr ) { - AddToDatabase(perf, subOp.get(), opTable, opTableColumnCount, memories, optDb); + AddToDatabase(perf, subOp.get(), cost, opTable, perfDebugTable, memories, optDb); } performance += perf; prevOp = subOp.get(); @@ -155,8 +252,8 @@ PerformanceResult NetworkPerformance::ProcessOpPerformance(SchedulerOperation *s } -void NetworkPerformance::AddToDatabase(const PerformanceResult &perf, SchedulerOperation *schedOp, int opTable, - int /*opTableColumnCount*/, const std::unordered_set &memories, OptimiserDatabase *optDb) +void NetworkPerformance::AddToDatabase(const PerformanceResult &perf, SchedulerOperation *schedOp, SchedulerOpInfo *cost, + int opTable, int perfDebugTable, const std::unordered_set &memories, OptimiserDatabase *optDb) { // Per-layer calculations assert(optDb != nullptr); @@ -188,7 +285,110 @@ void NetworkPerformance::AddToDatabase(const PerformanceResult &perf, SchedulerO row.push_back(std::to_string(perf.memory.at(mem).AccessCycles())); } - db->AddRow(opTable, schedOp->Index(), std::move(row)); + db->AddRow(opTable, schedOp->Uid(), std::move(row)); + + row = {}; + auto shapeToStrings = [&row](const std::vector &shape) + { + std::transform(shape.begin(), shape.end(), std::back_inserter(row), + [](int n) -> std::string { return n ? std::to_string(n) : ""; }); + }; + // clang-format off + // FM shapes + shapeToStrings(ReshapeToNHWC(schedOp->IFM(0)->shape).ToList()); + shapeToStrings(ReshapeToNHWC(schedOp->TryIFM(1) ? schedOp->IFM(1)->shape : Shape()).ToList()); + shapeToStrings(ReshapeToNHWC(schedOp->OFM()->shape).ToList()); + // Slice shapes + shapeToStrings(ReshapeToNHWC(schedOp->IFM(0)->slice.shape).ToList()); + shapeToStrings(ReshapeToNHWC(schedOp->TryIFM(1) ? 
schedOp->IFM(1)->slice.shape : Shape()).ToList()); + shapeToStrings(ReshapeToNHWC(schedOp->OFM()->slice.shape).ToList()); + // Stripe shapes + shapeToStrings(ReshapeToNHWC(cost->stripeInput[0]).ToList()); + shapeToStrings(ReshapeToNHWC(schedOp->TryIFM(1) ? cost->stripeInput[1] : Shape()).ToList()); + shapeToStrings(ReshapeToNHWC(cost->stripe).ToList()); + + row.insert(row.end(), { + // FM Memory + fmt::format("{}", schedOp->IFM(0)->tensor->memArea.memory->Name()), + fmt::format("{}", schedOp->TryIFM(1) ? schedOp->IFM(1)->tensor->memArea.memory->Name() : ""), + fmt::format("{}", schedOp->OFM()->tensor->memArea.memory->Name()), + // Formats + fmt::format("{}", EnumToString(schedOp->IFM(0)->tensor->format)), + fmt::format("{}", schedOp->TryIFM(1) ? EnumToString(schedOp->IFM(1)->tensor->format) : ""), + fmt::format("{}", EnumToString(schedOp->OFM()->tensor->format)), + // Data types + fmt::format("{}", EnumToString(schedOp->IFM(0)->tensor->dataType)), + fmt::format("{}", schedOp->TryIFM(1) ? EnumToString(schedOp->IFM(1)->tensor->dataType) : ""), + fmt::format("{}", EnumToString(schedOp->OFM()->tensor->dataType)), + // IFM Buffering + std::to_string(schedOp->IFM(0)->preBuffer), + schedOp->TryIFM(1) ? std::to_string(schedOp->IFM(1)->preBuffer) : "", + EnumToString(schedOp->IFM(0)->buffering), + schedOp->TryIFM(1) ? EnumToString(schedOp->IFM(1)->buffering) : "", + // Reverse and Transpose Types (same order as the "reverse_type", "transpose_type" columns) + EnumToString(schedOp->OFM()->reverse), + EnumToString(schedOp->OFM()->transpose), + // Timeindex + std::to_string(cost->timeIndex), + // Cascade + std::to_string(cost->cascade), + // Weights + cost->npuWeightsTensor ? cost->npuWeightsTensor->config->Format().ToString() : "", + cost->npuWeightsTensor ? EnumToString(cost->npuWeightsTensor->dataType) : "", + cost->npuWeightsTensor ? std::to_string(cost->npuWeightsTensor->totalWeightBytes) : "", + cost->npuWeightsTensor ? std::to_string(cost->npuWeightsTensor->maxRangeBytes) : "", + cost->npuWeightsTensor ? 
std::to_string(cost->npuWeightsTensor->subStreams) : "", + cost->npuWeightsTensor ? std::to_string(cost->npuWeightsTensor->distinctWeights) : "", + cost->npuWeightsTensor ? std::to_string(cost->npuWeightsTensor->zeroCount) : "", + // Scales + cost->npuScalesTensor ? EnumToString(cost->npuScalesTensor->dataType) : "", + cost->npuScalesTensor ? std::to_string(cost->npuScalesTensor->totalWeightBytes) : "", + cost->npuScalesTensor ? std::to_string(cost->npuScalesTensor->maxRangeBytes) : "", + // Weight Buffering + fmt::format("{}", fmt::join(cost->ofmDepthSlices, "|")), + cost->bufferedWeightTensor.tensor ? std::to_string(cost->bufferedWeightTensor.preBuffer) : "", + cost->bufferedWeightTensor.tensor ? EnumToString(cost->bufferedWeightTensor.buffering) : "", + cost->bufferedWeightTensor.tensor ? std::to_string(cost->fullWeightTransferCycles) : "", + // Kernel + std::to_string(schedOp->Kernel()->DepthMultiplier()), + std::to_string(schedOp->Kernel()->Padding().Top()), + std::to_string(schedOp->Kernel()->Padding().Bottom()), + std::to_string(schedOp->Kernel()->Padding().Left()), + std::to_string(schedOp->Kernel()->Padding().Right()), + std::to_string(schedOp->Kernel()->Padding().Near()), + std::to_string(schedOp->Kernel()->Padding().Far()), + std::to_string(schedOp->Kernel()->Size3D().x), + std::to_string(schedOp->Kernel()->Size3D().y), + std::to_string(schedOp->Kernel()->Size3D().z), + std::to_string(schedOp->Kernel()->Dilation3D().x), + std::to_string(schedOp->Kernel()->Dilation3D().y), + std::to_string(schedOp->Kernel()->Dilation3D().z), + std::to_string(schedOp->Kernel()->Stride3D().x), + std::to_string(schedOp->Kernel()->Stride3D().y), + std::to_string(schedOp->Kernel()->Stride3D().z), + }); + // clang-format on + for ( const auto mem : memories ) + { + // For all usages, add access read and access write: + for ( int i = 0; i < int(AccessType::Last); i++ ) + { + if ( perf.memory.at(mem).access.find(static_cast(i)) != perf.memory.at(mem).access.end() ) + { + 
row.push_back(std::to_string(perf.memory.at(mem).access.at(static_cast(i)).accessCycles)); + row.push_back(std::to_string(perf.memory.at(mem).access.at(static_cast(i)).bytesRead)); + row.push_back(std::to_string(perf.memory.at(mem).access.at(static_cast(i)).bytesWritten)); + } + else + { + row.emplace_back(""); + row.emplace_back(""); + row.emplace_back(""); + } + } + } + + db->AddRow(perfDebugTable, schedOp->Uid(), std::move(row)); } @@ -200,8 +400,23 @@ PerformanceResult NetworkPerformance::EstimateFullOpPerformance( PerformanceQuery query = Scheduler::InitPerfQuery(schedOp, cost->Config(), -1, wgtFormat); std::vector fused = Scheduler::InitFusionQuery(schedOp); + // Memory that NPU will source weights from for operations + ArchitectureMemory *weightsMemory = cost->npuWeightsTensor ? cost->npuWeightsTensor->memArea.memory : nullptr; + + _arch->Performance()->RecordToDB(schedOp->Uid()); CycleCost cycles = _arch->Performance()->MeasureCycleCost(query, fused); + if ( cost->npuWeightsTensor ) + { + WeightStats weightStats; + weightStats.size = cost->npuWeightsTensor->totalSourceBytes; + weightStats.encodedSize = cost->npuWeightsTensor->totalWeightBytes; + weightStats.zeroCount = cost->npuWeightsTensor->zeroCount; + weightStats.distinctWeights = cost->npuWeightsTensor->distinctWeights; + _arch->Performance()->RecordToDB(schedOp->Uid()); + _arch->Performance()->WeightDecodeCycles(query, weightStats, query.weightFormat, weightsMemory); + } + PerformanceResult result; result.npuCycles = cycles.opCycles; result.macCount = cycles.macs; @@ -238,9 +453,6 @@ PerformanceResult NetworkPerformance::EstimateFullOpPerformance( } } - // Memory that NPU will source weights from for operations - ArchitectureMemory *weightsMemory = cost->npuWeightsTensor ? 
cost->npuWeightsTensor->memArea.memory : nullptr; - + if ( weightsMemory && cost->bufferedWeightTensor.tensor ) { // DMA Weight Transfer diff --git a/ethosu/regor/compiler/network_performance.hpp b/ethosu/regor/compiler/network_performance.hpp index 6e0b942c..88f2be0b 100644 --- a/ethosu/regor/compiler/network_performance.hpp +++ b/ethosu/regor/compiler/network_performance.hpp @@ -32,12 +32,13 @@ namespace regor /// /// Performance information for a whole schedule /// -enum AccessType +enum class AccessType { Lut = 0, FeatureMap = 1, Weights = 2, Scales = 3, + Last, /* sentinel, not an access type: AddToDatabase loops i from 0 while i < int(AccessType::Last) to visit every enumerator above */ }; struct PerformanceResult @@ -146,8 +147,8 @@ private: SchedulerOperation *prevOp, SchedulerOpInfo *prevCost, const std::unordered_set &memories); PerformanceResult EstimateFullOpPerformance( SchedulerOperation *schedOp, SchedulerOpInfo *cost, SchedulerOperation *prevOp, SchedulerOpInfo *prevCost); - void AddToDatabase(const PerformanceResult &perf, SchedulerOperation *schedOp, int opTable, int columns, - const std::unordered_set &memories, OptimiserDatabase *optDb); + void AddToDatabase(const PerformanceResult &perf, SchedulerOperation *schedOp, SchedulerOpInfo *cost, int opTable, + int perfDebugTable, const std::unordered_set &memories, OptimiserDatabase *optDb); /* cost: read for the stripe, weight, scale and buffering columns of the debug row; perfDebugTable: passed to AddRow() together with schedOp->Uid() and that row */ }; diff --git a/ethosu/regor/compiler/scheduler_operation.cpp b/ethosu/regor/compiler/scheduler_operation.cpp new file mode 100644 index 00000000..c00b9eba --- /dev/null +++ b/ethosu/regor/compiler/scheduler_operation.cpp @@ -0,0 +1,29 @@ +// +// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "compiler/scheduler_operation.hpp" + +#include "common/logging.hpp" + +#include "common/bit_flags.hpp" + +BEGIN_ENUM_TABLE(regor::Buffering) /* name table for the three Buffering enumerators listed below; presumably what EnumToString() resolves Buffering values through - TODO confirm against the macro definition */ + ADD_ENUM_NAME(None) + ADD_ENUM_NAME(Single) + ADD_ENUM_NAME(Double) +END_ENUM_TABLE() diff --git a/ethosu/regor/compiler/shape_util.hpp b/ethosu/regor/compiler/shape_util.hpp index 4a695147..5563c5fb 100644 --- a/ethosu/regor/compiler/shape_util.hpp +++ b/ethosu/regor/compiler/shape_util.hpp @@ -71,4 +71,17 @@ inline Shape ReshapeTo3DAroundEdges(const Shape &shape, int minAxis = 1) return ReshapeTo3D(shape, {1, shape.Size() - 2, 1}, minAxis); } +inline Shape ReshapeToNHWC(Shape shape) /* Rewrites its by-value copy of `shape` and returns it; the caller's Shape is untouched. Going by the name the result is meant to be a 4-axis N,H,W,C shape - the exact axis handling depends on Shape::AxisProduct, PadAxes and Extract, which are not visible here. */ +{ + if ( !shape.IsValid() ) + { + shape = {0, 0, 0, 0}; /* an invalid shape is replaced by four zero-sized axes before any further processing */ + } + int batch = shape.AxisProduct(0, shape.Size() - 3); /* presumably the product of every axis ahead of the last three; NOTE(review): the end argument is negative when Size() is below 3 - confirm AxisProduct tolerates that */ + shape = Shape::PadAxes(shape, 4, 1).Extract(0, -3, -2, -1); /* presumably pads to at least four axes using 1, then keeps axis 0 and the last three axes - TODO confirm */ + shape[0] = batch; /* axis 0 is overwritten with the value computed before the pad and extract step */ + return shape; +} + + } // namespace regor -- GitLab