From 9358b828f0d0b82171890e984f94b37f63ca4385 Mon Sep 17 00:00:00 2001 From: Johan Gunnarsson Date: Wed, 4 Jun 2025 16:34:14 +0200 Subject: [PATCH 1/2] MLBEDSW-10894: Add a perf_debug_conn table to debug database The table describes connectivity between ops in Scheduler IR. Each row describes one input of an op. It contains 3 fields: * id: The scheduler IR operation ID. * input_op_id: The scheduler IR operation ID of a node producing an input. * input_index: The IFM index that is consuming this input. Two special values are used to describe graph inputs and graph outputs. "-2" is used in the "id" field for graph outputs. "-1" is used in the "input_op_id" field for graph inputs. A network that looks like this... [T1] -> (A) -> [T2] -> (B) -> [T3] ...where T1, T2, T3 are tensors, T1 is graph input, T3 is graph output, A (ID 0), B (ID 1) are operations, will have a perf_debug_conn table that looks like this: id, input_op_id, input_index 0, -1, 0 1, 0, 0 -2, 1, 0 Signed-off-by: Johan Gunnarsson Change-Id: Ia8cd8cad35ecf2d3e58987c651856a7d6ea8401b --- ethosu/regor/compiler/network_performance.cpp | 58 ++++++++++++++++--- ethosu/regor/compiler/network_performance.hpp | 4 +- ethosu/regor/compiler/scheduler_operation.hpp | 2 + 3 files changed, 53 insertions(+), 11 deletions(-) diff --git a/ethosu/regor/compiler/network_performance.cpp b/ethosu/regor/compiler/network_performance.cpp index 1f2db8e9..1e935557 100644 --- a/ethosu/regor/compiler/network_performance.cpp +++ b/ethosu/regor/compiler/network_performance.cpp @@ -52,14 +52,15 @@ PerformanceResult NetworkPerformance::Measure(Schedule *schedule, OptimiserDatab std::unordered_set regions( {_arch->ReadonlyMemory(), _arch->FeatureMapMemory(), _arch->LUTMemory(), _arch->StagingMemory()}); std::unordered_set tensorUids; - int opTable = 0; + int perfTable = 0; int perfDebugTable = 0; + int perfDebugConnectivityTable = 0; if ( optDb ) { db = optDb->Get(); _arch->Performance()->InitDatabase(db); - opTable = 
db->AddTable("perf"); + perfTable = db->AddTable("perf"); std::vector columns = { "source_id", "optimised_id", @@ -75,7 +76,7 @@ PerformanceResult NetworkPerformance::Measure(Schedule *schedule, OptimiserDatab std::string label = mem->Name() + "_ac"; columns.push_back(label); } - db->AddColumns(opTable, columns); + db->AddColumns(perfTable, columns); perfDebugTable = db->AddTable("perf_debug"); @@ -174,6 +175,14 @@ PerformanceResult NetworkPerformance::Measure(Schedule *schedule, OptimiserDatab } db->AddColumns(perfDebugTable, std::move(columns)); + + perfDebugConnectivityTable = db->AddTable("perf_debug_conn"); + + columns = { + "input_op_id", + "input_index", + }; + db->AddColumns(perfDebugConnectivityTable, std::move(columns)); } for ( auto const &schedOp : _ops ) @@ -196,7 +205,7 @@ PerformanceResult NetworkPerformance::Measure(Schedule *schedule, OptimiserDatab } if ( optDb != nullptr ) { - AddToDatabase(perf, schedOp.get(), cost, opTable, perfDebugTable, memories, optDb); + AddToDatabase(perf, schedOp.get(), cost, perfTable, perfDebugTable, perfDebugConnectivityTable, memories, optDb); } performance += perf; prevOp = schedOp.get(); @@ -208,7 +217,7 @@ PerformanceResult NetworkPerformance::Measure(Schedule *schedule, OptimiserDatab perf = ProcessOpPerformance(subOp.get(), cost, schedule, prevOp, prevCost, memories); if ( optDb != nullptr ) { - AddToDatabase(perf, subOp.get(), cost, opTable, perfDebugTable, memories, optDb); + AddToDatabase(perf, subOp.get(), cost, perfTable, perfDebugTable, perfDebugConnectivityTable, memories, optDb); } if ( !IsActivation(subOp->Type()) ) { @@ -257,8 +266,8 @@ PerformanceResult NetworkPerformance::ProcessOpPerformance(SchedulerOperation *s } -void NetworkPerformance::AddToDatabase(const PerformanceResult &perf, SchedulerOperation *schedOp, SchedulerOpInfo *cost, - int opTable, int perfDebugTable, const std::unordered_set &memories, OptimiserDatabase *optDb) +void NetworkPerformance::AddToDatabase(const PerformanceResult 
&perf, SchedulerOperation *schedOp, SchedulerOpInfo *cost, int perfTable, + int perfDebugTable, int perfDebugConnectivityTable, const std::unordered_set &memories, OptimiserDatabase *optDb) { // Per-layer calculations assert(optDb != nullptr); @@ -290,7 +299,7 @@ void NetworkPerformance::AddToDatabase(const PerformanceResult &perf, SchedulerO row.push_back(std::to_string(perf.memory.at(mem).accessCycles)); } - db->AddRow(opTable, schedOp->Uid(), std::move(row)); + db->AddRow(perfTable, schedOp->Uid(), std::move(row)); row = {}; auto shapeToStrings = [&row](const std::vector &shape) @@ -298,7 +307,7 @@ void NetworkPerformance::AddToDatabase(const PerformanceResult &perf, SchedulerO std::transform(shape.begin(), shape.end(), std::back_inserter(row), [](int n) -> std::string { return n ? std::to_string(n) : ""; }); }; - // clang-format off + // FM shapes shapeToStrings(ReshapeToNHWC(schedOp->IFM(0)->shape).ToList()); shapeToStrings(ReshapeToNHWC(schedOp->TryIFM(1) ? schedOp->IFM(1)->shape : Shape()).ToList()); @@ -312,6 +321,7 @@ void NetworkPerformance::AddToDatabase(const PerformanceResult &perf, SchedulerO shapeToStrings(ReshapeToNHWC(schedOp->TryIFM(1) ? 
cost->stripeInput[1] : Shape()).ToList()); shapeToStrings(ReshapeToNHWC(cost->stripe).ToList()); + // clang-format off row.insert(row.end(), { // FM Memory fmt::format("{}", schedOp->IFM(0)->tensor->memArea.memory->Name()), @@ -373,6 +383,7 @@ void NetworkPerformance::AddToDatabase(const PerformanceResult &perf, SchedulerO std::to_string(schedOp->Kernel()->Stride3D().z), }); // clang-format on + for ( const auto mem : memories ) { // Add read/write transferEfficiencies for all memories @@ -397,6 +408,35 @@ void NetworkPerformance::AddToDatabase(const PerformanceResult &perf, SchedulerO } db->AddRow(perfDebugTable, schedOp->Uid(), std::move(row)); + + // Store graph connectivity + if ( perfDebugConnectivityTable ) + { + for ( auto [usage, ifmConn] : schedOp->inputs.pairs() ) + { + if ( !IsIFM(usage) ) continue; + + const auto index = GetUsageIndex(usage); + if ( ifmConn.tensor->isGraphInput ) + { + db->AddRow(perfDebugConnectivityTable, schedOp->Uid(), {std::to_string(-1), std::to_string(index)}); + } + for ( auto &prod : ifmConn.tensor->producers ) + { + db->AddRow(perfDebugConnectivityTable, schedOp->Uid(), {std::to_string(prod->Uid()), std::to_string(index)}); + } + } + + for ( auto [usage, ofmConn] : schedOp->outputs.pairs() ) + { + if ( !IsOFM(usage) ) continue; + + if ( ofmConn.tensor->isGraphOutput ) + { + db->AddRow(perfDebugConnectivityTable, -2, {std::to_string(schedOp->Uid()), std::to_string(0)}); + } + } + } } diff --git a/ethosu/regor/compiler/network_performance.hpp b/ethosu/regor/compiler/network_performance.hpp index b379e0e1..4e41ff8a 100644 --- a/ethosu/regor/compiler/network_performance.hpp +++ b/ethosu/regor/compiler/network_performance.hpp @@ -143,8 +143,8 @@ private: SchedulerOperation *prevOp, SchedulerOpInfo *prevCost, const std::unordered_set &memories); PerformanceResult EstimateFullOpPerformance( SchedulerOperation *schedOp, SchedulerOpInfo *cost, SchedulerOperation *prevOp, SchedulerOpInfo *prevCost); - void AddToDatabase(const 
PerformanceResult &perf, SchedulerOperation *schedOp, SchedulerOpInfo *cost, int opTable, - int perfDebugTable, const std::unordered_set &memories, OptimiserDatabase *optDb); + void AddToDatabase(const PerformanceResult &perf, SchedulerOperation *schedOp, SchedulerOpInfo *cost, int opTable, int perfDebugTable, + int perfDebugConnectivityTable, const std::unordered_set &memories, OptimiserDatabase *optDb); }; diff --git a/ethosu/regor/compiler/scheduler_operation.hpp b/ethosu/regor/compiler/scheduler_operation.hpp index 3b5ce98f..0ced982c 100644 --- a/ethosu/regor/compiler/scheduler_operation.hpp +++ b/ethosu/regor/compiler/scheduler_operation.hpp @@ -84,6 +84,8 @@ public: std::shared_ptr Clone() const { auto clone = std::make_shared(*this); + clone->isGraphInput = false; // Cloned tensor is never graph input + clone->isGraphOutput = false; // Cloned tensor is never graph output clone->uid = GenerateUniqueId(); clone->equivalenceId = GenerateUniqueId(); clone->consumers.clear(); -- GitLab From 8d63671619e14f57128ff1e47a99d296eccff3b6 Mon Sep 17 00:00:00 2001 From: Johan Gunnarsson Date: Tue, 10 Jun 2025 17:22:12 +0200 Subject: [PATCH 2/2] MLBEDSW-10893: Properly call RecordOptimisation() There is one spot in RewriteSpaceToBatchConvBatchToSpace where this call was missing and led to an incorrect mapping between source op and optimised op in the debug database. 
Signed-off-by: Johan Gunnarsson Change-Id: I7e844750b70b0d4ee1f3f7e52f581eced730a537 --- ethosu/regor/compiler/tflite_graph_optimiser.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/ethosu/regor/compiler/tflite_graph_optimiser.cpp b/ethosu/regor/compiler/tflite_graph_optimiser.cpp index 5570a264..78b1bed0 100644 --- a/ethosu/regor/compiler/tflite_graph_optimiser.cpp +++ b/ethosu/regor/compiler/tflite_graph_optimiser.cpp @@ -1581,6 +1581,7 @@ Operation *TFLiteGraphOptimiser::RewriteSpaceToBatchConvBatchToSpace(Graph *cons if ( _supportedOps->Check(newOp.get()) ) { returnOp = newOp.get(); + RecordOptimisation(operation, returnOp); // Disconnect matched pattern prevOp->Disconnect(); nextOp->Disconnect(); -- GitLab