From 995ea8934c989b46a08d362e21655876ae29cd9c Mon Sep 17 00:00:00 2001
From: Johan Gunnarsson
Date: Mon, 26 May 2025 12:22:03 +0200
Subject: [PATCH] MLBEDSW-10617: Move ReplacePadByExplicitPadding to GraphIR

* Move the ReplacePadByExplicitPadding step to the GraphIR optimiser so it
  can run on TOSA networks as well. TOSA PAD with pad_const = 0 can make
  use of this optimization.
* Add unit test.

Signed-off-by: Johan Gunnarsson
Change-Id: I7baee6e82ae420268861c48d9d36e7ae39d7b9fe
---
 ethosu/regor/compiler/graphir_optimiser.cpp       | 79 +++++++++++++++++++
 ethosu/regor/compiler/graphir_optimiser.hpp       |  2 +
 .../regor/compiler/tflite_graph_optimiser.cpp     | 69 ----------------
 .../regor/compiler/tflite_graph_optimiser.hpp     |  8 --
 ethosu/regor/test/test_graphir_optimiser.cpp      | 59 ++++++++++++++
 ethosu/regor/test/util.cpp                        | 10 +++
 ethosu/regor/test/util.hpp                        |  4 +
 7 files changed, 154 insertions(+), 77 deletions(-)

diff --git a/ethosu/regor/compiler/graphir_optimiser.cpp b/ethosu/regor/compiler/graphir_optimiser.cpp
index 9f36244e..ca73a848 100644
--- a/ethosu/regor/compiler/graphir_optimiser.cpp
+++ b/ethosu/regor/compiler/graphir_optimiser.cpp
@@ -731,6 +731,85 @@ Operation *GraphIrOptimiser::MakeFillOperation(TensorConnection *const ofmConn,
     return fillOp.get();
 }
 
+// Tries to completely remove a PAD operator by using explicit padding.
+// E.g. a PAD operation that pads 1, followed by a CONV with VALID padding and kernel size 3
+// is rewritten such that the PAD is removed, and the CONV uses explicit padding.
+// Converts tens1 -> PAD -> tens2 -> CONV to tens1 -> CONV
+// This is the most efficient way to implement PAD, but cannot be done for all pad sizes.
+Operation *GraphIrOptimiser::ReplacePadByExplicitPadding(Graph *const graph, Operation *const operation)
+{
+    UNUSED(graph);
+    const OpType opType = operation->Type();
+    if ( IsConvolution(opType) && opType != OpType::TransposeConv2D && operation->Kernel()->Padding().IsZero() )
+    {
+        const auto &producers = operation->IFM(0)->Writers();
+        if ( producers.size() != 1 )
+        {
+            // IFM has multiple producers
+            return operation;
+        }
+
+        const auto &padOp = producers.front();
+        if ( padOp->Type() != OpType::Pad || padOp->Attribute<pad_attr_t>()->pad_const != 0 )
+        {
+            // Not a pad or not padding with zeros
+            return operation;
+        }
+
+        const auto padIfmConn = padOp->Input(TensorUsage::IFM0);
+        const auto padOfmConn = padOp->Output(TensorUsage::OFM);
+        const auto &padIfm = padIfmConn->tensor;
+        const auto &padOfm = padOfmConn->tensor;
+        if ( padIfm->Type() != padOfm->Type() || !IsScalingValidAndEqual(*padIfmConn, *padOfmConn) )
+        {
+            // Different data types or different scaling
+            return operation;
+        }
+
+        const auto padParamConn = padOp->Input(TensorUsage::Params);
+        const auto &padIfmShape = padIfmConn->SliceShape();
+        const auto beforePad = TensorToShape(padParamConn->tensor.get(), padIfmShape.Size(), 2, 0);
+        const auto afterPad = TensorToShape(padParamConn->tensor.get(), padIfmShape.Size(), 2, 1);
+        if ( beforePad.WithHW(0, 0) != beforePad.WithZeros() || afterPad.WithHW(0, 0) != afterPad.WithZeros() )
+        {
+            // Pad in other dimensions than height and width
+            return operation;
+        }
+
+        int top = beforePad.Height();
+        int left = beforePad.Width();
+        int bottom = afterPad.Height();
+        int right = afterPad.Width();
+        const auto &k = operation->Kernel();
+        const auto &kwh = k->DilatedWH();
+        auto CalcPadAfter = [](int inputSize, int stride, int filterSize, int padBefore, int padAfter) -> int
+        {
+            const int totalPadding = NeededTotalPadding(inputSize, stride, filterSize);
+            // The bottom/right padding might need downward adjustment depending on stride/input size
+            const int remainderDiff = padAfter % stride - (totalPadding - padBefore) % stride;
+            return std::max(0, padAfter - remainderDiff - (remainderDiff >= 0 ? 0 : stride));
+        };
+        // Adjust the padding attributes of the convolution operator
+        bottom = CalcPadAfter(padIfmShape.Height(), k->Stride().y, kwh.y, top, bottom);
+        right = CalcPadAfter(padIfmShape.Width(), k->Stride().x, kwh.x, left, right);
+        if ( left >= kwh.x || right >= kwh.x || top >= kwh.y || bottom >= kwh.y )
+        {
+            // Pad greater than or equal to kernel
+            return operation;
+        }
+
+        const auto kernel = k->WithPadding({top, left, bottom, right});
+        operation->SetKernel(std::make_unique<Kernel>(std::move(kernel)));
+        operation->CopyInput(TensorUsage::IFM0, *padIfmConn);
+        if ( padOfm->Readers().empty() )
+        {
+            // Bypass the PAD operator
+            padOp->Disconnect();
+        }
+    }
+    return operation;
+}
+
 Operation *GraphIrOptimiser::RewritePad(Graph *const, Operation *const operation)
 {
     Operation *returnOp = operation;
diff --git a/ethosu/regor/compiler/graphir_optimiser.hpp b/ethosu/regor/compiler/graphir_optimiser.hpp
index 31ac099d..1dd25cdf 100644
--- a/ethosu/regor/compiler/graphir_optimiser.hpp
+++ b/ethosu/regor/compiler/graphir_optimiser.hpp
@@ -52,6 +52,7 @@ private:
     Operation *RewriteRescaleInputs(Graph *const graph, Operation *const operation);
     Operation *RemoveRescaleUnsignedAttribute(Graph *const graph, Operation *const operation);
     Operation *RewriteRescale(Graph *const graph, Operation *const operation);
+    Operation *ReplacePadByExplicitPadding(Graph *const graph, Operation *const operation);
     Operation *RewritePad(Graph *const graph, Operation *const operation);
     Operation *FuseRescale(Graph *const graph, Operation *const operation);
     Operation *RewriteTable(Graph *const graph, Operation *const operation);
@@ -143,6 +144,7 @@ private:
         &GraphIrOptimiser::ConvertResizeOffsets,
         &GraphIrOptimiser::RewriteFullyConnected,
         &GraphIrOptimiser::FixupPoolStrides,
+        &GraphIrOptimiser::ReplacePadByExplicitPadding,
         &GraphIrOptimiser::RewritePad,
         &GraphIrOptimiser::RewriteTable,
         &GraphIrOptimiser::RewriteCast,
diff --git a/ethosu/regor/compiler/tflite_graph_optimiser.cpp b/ethosu/regor/compiler/tflite_graph_optimiser.cpp
index cb069b8c..5570a264 100644
--- a/ethosu/regor/compiler/tflite_graph_optimiser.cpp
+++ b/ethosu/regor/compiler/tflite_graph_optimiser.cpp
@@ -2452,75 +2452,6 @@ BufferReader TFLiteGraphOptimiser::GetPadValuesFromTensor(const std::shared
     return padValues;
 }
 
-// Based on explicit padding provided in a PAD operation, returns adjusted value for
-// padAfter that provides equivalent results when used with explicit padding
-int TFLiteGraphOptimiser::CalcPadAfter(int inputSize, int stride, int filterSize, int padBefore, int padAfter)
-{
-    int totalPadding = NeededTotalPadding(inputSize, stride, filterSize);
-    // The bottom/right padding might need downward adjustment depending on stride/input size
-    int remainderDiff = padAfter % stride - (totalPadding - padBefore) % stride;
-    return std::max(0, padAfter - remainderDiff - (remainderDiff >= 0 ? 0 : stride));
-}
-
-// Tries to completely remove a PAD operator by using explicit padding.
-// E.g. a PAD operation that pads 1, followed by a CONV with VALID padding and kernel size 3
-// is rewritten such that the PAD is removed, and the CONV uses explicit padding.
-// Converts tens1 -> PAD -> tens2 -> CONV to tens1 -> CONV
-// This is the most efficient way to implement PAD, but cannot be done for all pad sizes.
-Operation *TFLiteGraphOptimiser::ReplacePadByExplicitPadding(Graph *const graph, Operation *const operation)
-{
-    UNUSED(graph);
-    if ( IsConvolution(operation->Type()) && operation->Type() != OpType::TransposeConv2D &&
-         operation->Kernel()->Padding().IsZero() && operation->IFM(0)->Writers().size() == 1 )
-    {
-        // Potential for future optimization: in certain cases also Pad+AvgPool can be handled
-        // by changing to Depthwise.
-        auto padOp = operation->IFM(0)->Writers()[0].get();
-        if ( padOp->Type() != OpType::Pad || padOp->Attribute<pad_attr_t>()->pad_const != 0 )
-        {
-            return operation;
-        }
-        auto padIfmConn = padOp->Input(TensorUsage::IFM0);
-        auto padOfmConn = padOp->Output(TensorUsage::OFM);
-        const auto &padIfm = padOp->IFM(0);
-        const auto &padOfm = padOp->OFM();
-
-        if ( padIfm->Type() != padOfm->Type() || !IsScalingValidAndEqual(*padIfmConn, *padOfmConn) )
-        {
-            return operation;
-        }
-        auto padTensor = padOp->Input(TensorUsage::Params)->tensor;
-        BufferReader padValues = GetPadValuesFromTensor(padTensor);
-        int numPadValues = padTensor->View().Elements();
-        int top = GetPadValue(padValues, numPadValues, PadAxis::Top);
-        int bottom = GetPadValue(padValues, numPadValues, PadAxis::Bottom);
-        int left = GetPadValue(padValues, numPadValues, PadAxis::Left);
-        int right = GetPadValue(padValues, numPadValues, PadAxis::Right);
-
-        const auto &k = operation->Kernel();
-        const auto &kwh = k->DilatedWH();
-        if ( left + right >= kwh.x || top + bottom >= kwh.y )
-        {
-            // Too much padding
-            return operation;
-        }
-        const auto &ifmShape = padOp->Input(TensorUsage::IFM0)->shape;
-        int bottomPad = CalcPadAfter(ifmShape.Height(), k->Stride().y, kwh.y, top, bottom);
-        int rightPad = CalcPadAfter(ifmShape.Width(), k->Stride().x, kwh.x, left, right);
-        // Adjust the padding attributes of the convolution operator
-        auto kernel = std::make_unique<Kernel>(
-            Kernel(k->Size(), k->Stride(), k->Dilation(), k->DepthMultiplier(), Margin(top, left, bottomPad, rightPad)));
-        operation->SetKernel(std::move(kernel));
-        operation->CopyInput(TensorUsage::IFM0, *(padOp->Input(TensorUsage::IFM0)));
-        if ( padOfm->Readers().empty() )
-        {
-            // Bypass the PAD operator
-            padOp->Disconnect();
-        }
-    }
-    return operation;
-}
-
 // Lower PadV2 to TOSA Pad
 Operation *TFLiteGraphOptimiser::ConvertPadV2(Graph *const graph, Operation *const operation)
 {
diff --git a/ethosu/regor/compiler/tflite_graph_optimiser.hpp b/ethosu/regor/compiler/tflite_graph_optimiser.hpp
index 088fe3ef..81f2359b 100644
--- a/ethosu/regor/compiler/tflite_graph_optimiser.hpp
+++ b/ethosu/regor/compiler/tflite_graph_optimiser.hpp
@@ -161,13 +161,6 @@ private:
     // Based on explicit padding provided in a PAD operation, returns adjusted value for
     // padAfter that provides equivalent results when used with explicit padding
     int CalcPadAfter(int inputSize, int stride, int filterSize, int padBefore, int padAfter);
 
-    // Tries to completely remove a PAD operator by using explicit padding.
-    // E.g. a PAD operation that pads 1, followed by a CONV with VALID padding and kernel size 3
-    // is rewritten such that the PAD is removed, and the CONV uses explicit padding.
-    // Converts tens1 -> PAD -> tens2 -> CONV to tens1 -> CONV
-    // This is the most efficient way to implement PAD, but cannot be done for all pad sizes.
-    Operation *ReplacePadByExplicitPadding(Graph *const graph, Operation *const operation);
-
     // Lower PadV2 to TOSA Pad
     Operation *ConvertPadV2(Graph *const graph, Operation *const operation);
 
@@ -259,7 +252,6 @@ public:
         &TFLiteGraphOptimiser::ConvertTanhSigmoidToLUT,
         &TFLiteGraphOptimiser::ConvertSoftmaxOps,
         &TFLiteGraphOptimiser::ConvertLstmOps,
-        &TFLiteGraphOptimiser::ReplacePadByExplicitPadding,
         &TFLiteGraphOptimiser::ConvertMeanOps,
         &TFLiteGraphOptimiser::ConvertPrelu,
         &TFLiteGraphOptimiser::ConvertLeakyRelu,
diff --git a/ethosu/regor/test/test_graphir_optimiser.cpp b/ethosu/regor/test/test_graphir_optimiser.cpp
index 4a2289c8..f0c64341 100644
--- a/ethosu/regor/test/test_graphir_optimiser.cpp
+++ b/ethosu/regor/test/test_graphir_optimiser.cpp
@@ -260,3 +260,62 @@ TEST_CASE("test_graphir_optimiser - transpose merge")
     REQUIRE(allOps.back()->Type() == OpType::Add);
     REQUIRE(allOps.front()->Output(TensorUsage::OFM)->tensor == allOps.back()->Input(TensorUsage::IFM)->tensor);
 }
+
+TEST_CASE("test_graphir_optimiser - replace pad by explicit padding")
+{
+    // Create arch
+    auto arch = CreateArchDefault();
+    std::string err = "noerror";
+    arch->CheckConfiguration(err);
+    REQUIRE(err == "noerror");
+
+    // Constant data for the Pad op's paddings tensor
+    std::vector<int8_t> paddings = {{
+        0,
+        0,
+        1 /* top */,
+        4 /* bottom */,
+        3 /* left */,
+        2 /* right */,
+        0,
+        0,
+    }};
+
+    std::vector<std::shared_ptr<Operation>> ops;
+    auto padIfm = CreateTensor("INPUT", Shape(1, 7, 7, 3), DataType::Int8, 1);
+    auto padParam = CreateTensor("PADPARAM", Shape(8), DataType::Int8, std::move(paddings));
+    auto padOfm = CreateTensor("PADOFM", Shape(1, 12, 12, 3), DataType::Int8);
+    auto convWeights = CreateTensor("WEIGHTS", Shape(1, 6, 6, 9), DataType::Int8, 42);
+    auto convBias = CreateTensor("BIAS", Shape(1, 1, 1, 9), DataType::Int8, 0);
+    auto convOfm = CreateTensor("OUTPUT", Shape(1, 7, 7, 9), DataType::Int8);
+
+    // Create Pad op
+    ops.push_back(CreateOperation(OpType::Pad, TensorUsage::IFM, padIfm, TensorUsage::Params, padParam, TensorUsage::OFM, padOfm));
+    pad_attr_t *attr = ops.back()->Attribute<pad_attr_t>();
+    attr->pad_const = 0;
+
+    // Create Conv2D op
+    ops.push_back(CreateOperation(OpType::Conv2D, TensorUsage::IFM, padOfm, TensorUsage::Weights, convWeights,
+        TensorUsage::Scales, convBias, TensorUsage::OFM, convOfm));
+    Kernel kernel = Kernel::UnitKernel().WithSize({6, 6});
+    ops.back()->SetKernel(std::make_unique<Kernel>(std::move(kernel)));
+
+    auto graph = CreateGraph(ops);
+
+    GraphOptimiserOptions options;
+    auto optimiser = GraphOptimiser::MakeGraphOptimiser(graph->Notation(), arch.get(), options, nullptr);
+
+    optimiser->Process(graph.get());
+
+    std::vector<Operation *> allOps;
+    graph->GetAllOperations(allOps);
+    REQUIRE(allOps.size() == 1);
+    REQUIRE(allOps[0]->Type() == OpType::Conv2D);
+    auto &padding = allOps[0]->Kernel()->Padding();
+    REQUIRE(padding.Top() == 1);
+    REQUIRE(padding.Left() == 3);
+    REQUIRE(padding.Bottom() == 4);
+    REQUIRE(padding.Right() == 2);
+    REQUIRE(padding.Near() == 0);
+    REQUIRE(padding.Far() == 0);
+}
diff --git a/ethosu/regor/test/util.cpp b/ethosu/regor/test/util.cpp
index 7ee08773..f837a527 100644
--- a/ethosu/regor/test/util.cpp
+++ b/ethosu/regor/test/util.cpp
@@ -177,6 +177,16 @@ std::shared_ptr<Operation> CreateOperation(OpType opType, TensorUsage ifmUsage,
     return op;
 }
 
+// Create a Operation with three inputs
+std::shared_ptr<Operation> CreateOperation(OpType opType, TensorUsage ifmUsage, std::shared_ptr<Tensor> &ifm,
+    TensorUsage ifm2Usage, std::shared_ptr<Tensor> &ifm2, TensorUsage ifm3Usage, std::shared_ptr<Tensor> &ifm3,
+    TensorUsage ofmUsage, std::shared_ptr<Tensor> &ofm)
+{
+    auto op = CreateOperation(opType, ifmUsage, ifm, ifm2Usage, ifm2, ofmUsage, ofm);
+    op->ConnectInput(ifm3Usage, ifm3).Set(Quantization::Unit());
+    return op;
+}
+
 // Helpers for Scheduler IR
 // -----------------------------
 // Create a SchedulerTensor with name, storageshape and datatype
diff --git a/ethosu/regor/test/util.hpp b/ethosu/regor/test/util.hpp
index 247329af..674907ce 100644
--- a/ethosu/regor/test/util.hpp
+++ b/ethosu/regor/test/util.hpp
@@ -81,6 +81,10 @@ std::shared_ptr<Operation> CreateOperation(OpType opType, TensorUsage ifmUsage,
 // Create a Operation with binary input
 std::shared_ptr<Operation> CreateOperation(OpType opType, TensorUsage ifmUsage, std::shared_ptr<Tensor> &ifm,
     TensorUsage ifm2Usage, std::shared_ptr<Tensor> &ifm2, TensorUsage ofmUsage, std::shared_ptr<Tensor> &ofm);
+// Create a Operation with three inputs
+std::shared_ptr<Operation> CreateOperation(OpType opType, TensorUsage ifmUsage, std::shared_ptr<Tensor> &ifm,
+    TensorUsage ifm2Usage, std::shared_ptr<Tensor> &ifm2, TensorUsage ifm3Usage, std::shared_ptr<Tensor> &ifm3,
+    TensorUsage ofmUsage, std::shared_ptr<Tensor> &ofm);
 
 // Helpers for Scheduler IR
 // -----------------------------
-- 
GitLab
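
Note (not part of the patch): the sketch below reproduces, in isolation, the pad-after adjustment that both the removed TFLiteGraphOptimiser::CalcPadAfter and the new CalcPadAfter lambda compute. The NeededTotalPadding helper shown here is an assumption about its behaviour (the total SAME padding for the given input size, stride and dilated filter size); it is not copied from regor.

// Standalone sketch of the bottom/right padding adjustment used when a PAD
// is folded into a convolution as explicit padding.
#include <algorithm>
#include <cassert>

// Assumed behaviour of NeededTotalPadding: total SAME padding needed so that
// the output size equals ceil(inputSize / stride).
static int NeededTotalPadding(int inputSize, int stride, int filterSize)
{
    const int outSize = (inputSize + stride - 1) / stride;  // ceil(inputSize / stride)
    return std::max(0, (outSize - 1) * stride + filterSize - inputSize);
}

// Same arithmetic as the CalcPadAfter lambda in the patch: trim the
// bottom/right padding so an explicitly padded convolution matches
// PAD followed by a VALID convolution.
static int CalcPadAfter(int inputSize, int stride, int filterSize, int padBefore, int padAfter)
{
    const int totalPadding = NeededTotalPadding(inputSize, stride, filterSize);
    const int remainderDiff = padAfter % stride - (totalPadding - padBefore) % stride;
    return std::max(0, padAfter - remainderDiff - (remainderDiff >= 0 ? 0 : stride));
}

int main()
{
    // Case from the new unit test: 7-wide IFM, stride 1, 6-wide kernel,
    // pad before = 1, pad after = 4: with stride 1 no adjustment is needed.
    assert(CalcPadAfter(7, 1, 6, 1, 4) == 4);

    // Hypothetical stride-2 case: pad after 2 is trimmed to 1, because the
    // last padded column is never visited by the sliding window.
    assert(CalcPadAfter(7, 2, 3, 1, 2) == 1);
    return 0;
}

For the shapes used in the new unit test (stride 1 everywhere) the adjustment is a no-op, which is why the test expects the PAD values 1/3/4/2 to be carried over unchanged into the Conv2D kernel padding.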