From 995ea8934c989b46a08d362e21655876ae29cd9c Mon Sep 17 00:00:00 2001
From: Johan Gunnarsson
Date: Mon, 26 May 2025 12:22:03 +0200
Subject: [PATCH] MLBEDSW-10617: Move ReplacePadByExplicitPadding to GraphIR

* Move the ReplacePadByExplicitPadding step to the GraphIR optimiser so it
  can run on TOSA networks as well. TOSA PAD with pad_const = 0 can make
  use of this optimization.
* Add unit test.

Signed-off-by: Johan Gunnarsson
Change-Id: I7baee6e82ae420268861c48d9d36e7ae39d7b9fe
---
 ethosu/regor/compiler/graphir_optimiser.cpp       | 79 +++++++++++++++++++
 ethosu/regor/compiler/graphir_optimiser.hpp       |  2 +
 .../regor/compiler/tflite_graph_optimiser.cpp     | 69 ----------------
 .../regor/compiler/tflite_graph_optimiser.hpp     |  8 --
 ethosu/regor/test/test_graphir_optimiser.cpp      | 59 ++++++++++++++
 ethosu/regor/test/util.cpp                        | 10 +++
 ethosu/regor/test/util.hpp                        |  4 +
 7 files changed, 154 insertions(+), 77 deletions(-)

diff --git a/ethosu/regor/compiler/graphir_optimiser.cpp b/ethosu/regor/compiler/graphir_optimiser.cpp
index 9f36244e..ca73a848 100644
--- a/ethosu/regor/compiler/graphir_optimiser.cpp
+++ b/ethosu/regor/compiler/graphir_optimiser.cpp
@@ -731,6 +731,85 @@ Operation *GraphIrOptimiser::MakeFillOperation(TensorConnection *const ofmConn,
     return fillOp.get();
 }
 
+// Tries to completely remove a PAD operator by using explicit padding.
+// E.g. a PAD operation that pads 1, followed by a CONV with VALID padding and kernel size 3
+// is rewritten such that the PAD is removed, and the CONV uses explicit padding.
+// Converts tens1 -> PAD -> tens2 -> CONV to tens1 -> CONV
+// This is the most efficient way to implement PAD, but cannot be done for all pad sizes.
+Operation *GraphIrOptimiser::ReplacePadByExplicitPadding(Graph *const graph, Operation *const operation)
+{
+    UNUSED(graph);
+    const OpType opType = operation->Type();
+    if ( IsConvolution(opType) && opType != OpType::TransposeConv2D && operation->Kernel()->Padding().IsZero() )
+    {
+        const auto &producers = operation->IFM(0)->Writers();
+        if ( producers.size() != 1 )
+        {
+            // IFM has multiple producers
+            return operation;
+        }
+
+        const auto &padOp = producers.front();
+        if ( padOp->Type() != OpType::Pad || padOp->Attribute<pad_attr_t>()->pad_const != 0 )
+        {
+            // Not a pad or not padding with zeros
+            return operation;
+        }
+
+        const auto padIfmConn = padOp->Input(TensorUsage::IFM0);
+        const auto padOfmConn = padOp->Output(TensorUsage::OFM);
+        const auto &padIfm = padIfmConn->tensor;
+        const auto &padOfm = padOfmConn->tensor;
+        if ( padIfm->Type() != padOfm->Type() || !IsScalingValidAndEqual(*padIfmConn, *padOfmConn) )
+        {
+            // Different data types or different scaling
+            return operation;
+        }
+
+        const auto padParamConn = padOp->Input(TensorUsage::Params);
+        const auto &padIfmShape = padIfmConn->SliceShape();
+        const auto beforePad = TensorToShape(padParamConn->tensor.get(), padIfmShape.Size(), 2, 0);
+        const auto afterPad = TensorToShape(padParamConn->tensor.get(), padIfmShape.Size(), 2, 1);
+        if ( beforePad.WithHW(0, 0) != beforePad.WithZeros() || afterPad.WithHW(0, 0) != afterPad.WithZeros() )
+        {
+            // Pad in other dimensions than height and width
+            return operation;
+        }
+
+        int top = beforePad.Height();
+        int left = beforePad.Width();
+        int bottom = afterPad.Height();
+        int right = afterPad.Width();
+        const auto &k = operation->Kernel();
+        const auto &kwh = k->DilatedWH();
+        auto CalcPadAfter = [](int inputSize, int stride, int filterSize, int padBefore, int padAfter) -> int
+        {
+            const int totalPadding = NeededTotalPadding(inputSize, stride, filterSize);
+            // The bottom/right padding might need downward adjustment depending on stride/input size
+            const int remainderDiff = padAfter % stride - (totalPadding - padBefore) % stride;
+            return std::max(0, padAfter - remainderDiff - (remainderDiff >= 0 ? 0 : stride));
+        };
+        // Adjust the padding attributes of the convolution operator
+        bottom = CalcPadAfter(padIfmShape.Height(), k->Stride().y, kwh.y, top, bottom);
+        right = CalcPadAfter(padIfmShape.Width(), k->Stride().x, kwh.x, left, right);
+        if ( left >= kwh.x || right >= kwh.x || top >= kwh.y || bottom >= kwh.y )
+        {
+            // Pad greater than or equal to kernel
+            return operation;
+        }
+
+        const auto kernel = k->WithPadding({top, left, bottom, right});
+        operation->SetKernel(std::make_unique<Kernel>(std::move(kernel)));
+        operation->CopyInput(TensorUsage::IFM0, *padIfmConn);
+        if ( padOfm->Readers().empty() )
+        {
+            // Bypass the PAD operator
+            padOp->Disconnect();
+        }
+    }
+    return operation;
+}
+
 Operation *GraphIrOptimiser::RewritePad(Graph *const, Operation *const operation)
 {
     Operation *returnOp = operation;
diff --git a/ethosu/regor/compiler/graphir_optimiser.hpp b/ethosu/regor/compiler/graphir_optimiser.hpp
index 31ac099d..1dd25cdf 100644
--- a/ethosu/regor/compiler/graphir_optimiser.hpp
+++ b/ethosu/regor/compiler/graphir_optimiser.hpp
@@ -52,6 +52,7 @@ private:
     Operation *RewriteRescaleInputs(Graph *const graph, Operation *const operation);
     Operation *RemoveRescaleUnsignedAttribute(Graph *const graph, Operation *const operation);
     Operation *RewriteRescale(Graph *const graph, Operation *const operation);
+    Operation *ReplacePadByExplicitPadding(Graph *const graph, Operation *const operation);
     Operation *RewritePad(Graph *const graph, Operation *const operation);
     Operation *FuseRescale(Graph *const graph, Operation *const operation);
     Operation *RewriteTable(Graph *const graph, Operation *const operation);
@@ -143,6 +144,7 @@ private:
         &GraphIrOptimiser::ConvertResizeOffsets,
         &GraphIrOptimiser::RewriteFullyConnected,
         &GraphIrOptimiser::FixupPoolStrides,
+        &GraphIrOptimiser::ReplacePadByExplicitPadding,
         &GraphIrOptimiser::RewritePad,
         &GraphIrOptimiser::RewriteTable,
         &GraphIrOptimiser::RewriteCast,
diff --git a/ethosu/regor/compiler/tflite_graph_optimiser.cpp b/ethosu/regor/compiler/tflite_graph_optimiser.cpp
index cb069b8c..5570a264 100644
--- a/ethosu/regor/compiler/tflite_graph_optimiser.cpp
+++ b/ethosu/regor/compiler/tflite_graph_optimiser.cpp
@@ -2452,75 +2452,6 @@ BufferReader TFLiteGraphOptimiser::GetPadValuesFromTensor(const std::shared
     return padValues;
 }
 
-// Based on explicit padding provided in a PAD operation, returns adjusted value for
-// padAfter that provides equivalent results when used with explicit padding
-int TFLiteGraphOptimiser::CalcPadAfter(int inputSize, int stride, int filterSize, int padBefore, int padAfter)
-{
-    int totalPadding = NeededTotalPadding(inputSize, stride, filterSize);
-    // The bottom/right padding might need downward adjustment depending on stride/input size
-    int remainderDiff = padAfter % stride - (totalPadding - padBefore) % stride;
-    return std::max(0, padAfter - remainderDiff - (remainderDiff >= 0 ? 0 : stride));
-}
-
-// Tries to completely remove a PAD operator by using explicit padding.
-// E.g. a PAD operation that pads 1, followed by a CONV with VALID padding and kernel size 3
-// is rewritten such that the PAD is removed, and the CONV uses explicit padding.
-// Converts tens1 -> PAD -> tens2 -> CONV to tens1 -> CONV
-// This is the most efficient way to implement PAD, but cannot be done for all pad sizes.
-Operation *TFLiteGraphOptimiser::ReplacePadByExplicitPadding(Graph *const graph, Operation *const operation)
-{
-    UNUSED(graph);
-    if ( IsConvolution(operation->Type()) && operation->Type() != OpType::TransposeConv2D &&
-         operation->Kernel()->Padding().IsZero() && operation->IFM(0)->Writers().size() == 1 )
-    {
-        // Potential for future optimization: in certain cases also Pad+AvgPool can be handled
-        // by changing to Depthwise.
-        auto padOp = operation->IFM(0)->Writers()[0].get();
-        if ( padOp->Type() != OpType::Pad || padOp->Attribute<pad_attr_t>()->pad_const != 0 )
-        {
-            return operation;
-        }
-        auto padIfmConn = padOp->Input(TensorUsage::IFM0);
-        auto padOfmConn = padOp->Output(TensorUsage::OFM);
-        const auto &padIfm = padOp->IFM(0);
-        const auto &padOfm = padOp->OFM();
-
-        if ( padIfm->Type() != padOfm->Type() || !IsScalingValidAndEqual(*padIfmConn, *padOfmConn) )
-        {
-            return operation;
-        }
-        auto padTensor = padOp->Input(TensorUsage::Params)->tensor;
-        BufferReader padValues = GetPadValuesFromTensor(padTensor);
-        int numPadValues = padTensor->View().Elements();
-        int top = GetPadValue(padValues, numPadValues, PadAxis::Top);
-        int bottom = GetPadValue(padValues, numPadValues, PadAxis::Bottom);
-        int left = GetPadValue(padValues, numPadValues, PadAxis::Left);
-        int right = GetPadValue(padValues, numPadValues, PadAxis::Right);
-
-        const auto &k = operation->Kernel();
-        const auto &kwh = k->DilatedWH();
-        if ( left + right >= kwh.x || top + bottom >= kwh.y )
-        {
-            // Too much padding
-            return operation;
-        }
-        const auto &ifmShape = padOp->Input(TensorUsage::IFM0)->shape;
-        int bottomPad = CalcPadAfter(ifmShape.Height(), k->Stride().y, kwh.y, top, bottom);
-        int rightPad = CalcPadAfter(ifmShape.Width(), k->Stride().x, kwh.x, left, right);
-        // Adjust the padding attributes of the convolution operator
-        auto kernel = std::make_unique<Kernel>(
-            Kernel(k->Size(), k->Stride(), k->Dilation(), k->DepthMultiplier(), Margin(top, left, bottomPad, rightPad)));
-        operation->SetKernel(std::move(kernel));
-        operation->CopyInput(TensorUsage::IFM0, *(padOp->Input(TensorUsage::IFM0)));
-        if ( padOfm->Readers().empty() )
-        {
-            // Bypass the PAD operator
-            padOp->Disconnect();
-        }
-    }
-    return operation;
-}
-
 // Lower PadV2 to TOSA Pad
 Operation *TFLiteGraphOptimiser::ConvertPadV2(Graph *const graph, Operation *const operation)
 {
diff --git a/ethosu/regor/compiler/tflite_graph_optimiser.hpp b/ethosu/regor/compiler/tflite_graph_optimiser.hpp
index 088fe3ef..81f2359b 100644
--- a/ethosu/regor/compiler/tflite_graph_optimiser.hpp
+++ b/ethosu/regor/compiler/tflite_graph_optimiser.hpp
@@ -161,13 +161,6 @@ private:
     // Based on explicit padding provided in a PAD operation, returns adjusted value for
     // padAfter that provides equivalent results when used with explicit padding
     int CalcPadAfter(int inputSize, int stride, int filterSize, int padBefore, int padAfter);
 
-    // Tries to completely remove a PAD operator by using explicit padding.
-    // E.g. a PAD operation that pads 1, followed by a CONV with VALID padding and kernel size 3
-    // is rewritten such that the PAD is removed, and the CONV uses explicit padding.
-    // Converts tens1 -> PAD -> tens2 -> CONV to tens1 -> CONV
-    // This is the most efficient way to implement PAD, but cannot be done for all pad sizes.
-    Operation *ReplacePadByExplicitPadding(Graph *const graph, Operation *const operation);
-
     // Lower PadV2 to TOSA Pad
     Operation *ConvertPadV2(Graph *const graph, Operation *const operation);
 
@@ -259,7 +252,6 @@ public:
         &TFLiteGraphOptimiser::ConvertTanhSigmoidToLUT,
         &TFLiteGraphOptimiser::ConvertSoftmaxOps,
         &TFLiteGraphOptimiser::ConvertLstmOps,
-        &TFLiteGraphOptimiser::ReplacePadByExplicitPadding,
         &TFLiteGraphOptimiser::ConvertMeanOps,
         &TFLiteGraphOptimiser::ConvertPrelu,
         &TFLiteGraphOptimiser::ConvertLeakyRelu,
diff --git a/ethosu/regor/test/test_graphir_optimiser.cpp b/ethosu/regor/test/test_graphir_optimiser.cpp
index 4a2289c8..f0c64341 100644
--- a/ethosu/regor/test/test_graphir_optimiser.cpp
+++ b/ethosu/regor/test/test_graphir_optimiser.cpp
@@ -260,3 +260,62 @@ TEST_CASE("test_graphir_optimiser - transpose merge")
     REQUIRE(allOps.back()->Type() == OpType::Add);
     REQUIRE(allOps.front()->Output(TensorUsage::OFM)->tensor == allOps.back()->Input(TensorUsage::IFM)->tensor);
 }
+
+TEST_CASE("test_graphir_optimiser - replace pad by explicit padding")
+{
+    // Create arch
+    auto arch = CreateArchDefault();
+    std::string err = "noerror";
+    arch->CheckConfiguration(err);
+    REQUIRE(err == "noerror");
+
+    // Constant data for the Pad op's paddings tensor
+    std::vector<int8_t> paddings = {{
+        0,
+        0,
+        1 /* top */,
+        4 /* bottom */,
+        3 /* left */,
+        2 /* right */,
+        0,
+        0,
+    }};
+
+    std::vector<std::shared_ptr<Operation>> ops;
+    auto padIfm = CreateTensor("INPUT", Shape(1, 7, 7, 3), DataType::Int8, 1);
+    auto padParam = CreateTensor("PADPARAM", Shape(8), DataType::Int8, std::move(paddings));
+    auto padOfm = CreateTensor("PADOFM", Shape(1, 12, 12, 3), DataType::Int8);
+    auto convWeights = CreateTensor("WEIGHTS", Shape(1, 6, 6, 9), DataType::Int8, 42);
+    auto convBias = CreateTensor("BIAS", Shape(1, 1, 1, 9), DataType::Int8, 0);
+    auto convOfm = CreateTensor("OUTPUT", Shape(1, 7, 7, 9), DataType::Int8);
+
+    // Create Pad op
+    ops.push_back(CreateOperation(OpType::Pad, TensorUsage::IFM, padIfm, TensorUsage::Params, padParam, TensorUsage::OFM, padOfm));
+    pad_attr_t *attr = ops.back()->Attribute<pad_attr_t>();
+    attr->pad_const = 0;
+
+    // Create Conv2D op
+    ops.push_back(CreateOperation(OpType::Conv2D, TensorUsage::IFM, padOfm, TensorUsage::Weights, convWeights,
+        TensorUsage::Scales, convBias, TensorUsage::OFM, convOfm));
+    Kernel kernel = Kernel::UnitKernel().WithSize({6, 6});
+    ops.back()->SetKernel(std::make_unique<Kernel>(std::move(kernel)));
+
+    auto graph = CreateGraph(ops);
+
+    GraphOptimiserOptions options;
+    auto optimiser = GraphOptimiser::MakeGraphOptimiser(graph->Notation(), arch.get(), options, nullptr);
+
+    optimiser->Process(graph.get());
+
+    std::vector<Operation *> allOps;
+    graph->GetAllOperations(allOps);
+    REQUIRE(allOps.size() == 1);
+    REQUIRE(allOps[0]->Type() == OpType::Conv2D);
+    auto &padding = allOps[0]->Kernel()->Padding();
+    REQUIRE(padding.Top() == 1);
+    REQUIRE(padding.Left() == 3);
+    REQUIRE(padding.Bottom() == 4);
+    REQUIRE(padding.Right() == 2);
+    REQUIRE(padding.Near() == 0);
+    REQUIRE(padding.Far() == 0);
+}
diff --git a/ethosu/regor/test/util.cpp b/ethosu/regor/test/util.cpp
index 7ee08773..f837a527 100644
--- a/ethosu/regor/test/util.cpp
+++ b/ethosu/regor/test/util.cpp
@@ -177,6 +177,16 @@ std::shared_ptr<Operation> CreateOperation(OpType opType, TensorUsage ifmUsage,
     return op;
 }
 
+// Create a Operation with three inputs
+std::shared_ptr<Operation> CreateOperation(OpType opType, TensorUsage ifmUsage, std::shared_ptr<Tensor> &ifm,
+    TensorUsage ifm2Usage, std::shared_ptr<Tensor> &ifm2, TensorUsage ifm3Usage, std::shared_ptr<Tensor> &ifm3,
+    TensorUsage ofmUsage, std::shared_ptr<Tensor> &ofm)
+{
+    auto op = CreateOperation(opType, ifmUsage, ifm, ifm2Usage, ifm2, ofmUsage, ofm);
+    op->ConnectInput(ifm3Usage, ifm3).Set(Quantization::Unit());
+    return op;
+}
+
 // Helpers for Scheduler IR
 // -----------------------------
 // Create a SchedulerTensor with name, storageshape and datatype
diff --git a/ethosu/regor/test/util.hpp b/ethosu/regor/test/util.hpp
index 247329af..674907ce 100644
--- a/ethosu/regor/test/util.hpp
+++ b/ethosu/regor/test/util.hpp
@@ -81,6 +81,10 @@ std::shared_ptr<Operation> CreateOperation(OpType opType, TensorUsage ifmUsage,
 // Create a Operation with binary input
 std::shared_ptr<Operation> CreateOperation(OpType opType, TensorUsage ifmUsage, std::shared_ptr<Tensor> &ifm,
     TensorUsage ifm2Usage, std::shared_ptr<Tensor> &ifm2, TensorUsage ofmUsage, std::shared_ptr<Tensor> &ofm);
+// Create a Operation with three inputs
+std::shared_ptr<Operation> CreateOperation(OpType opType, TensorUsage ifmUsage, std::shared_ptr<Tensor> &ifm,
+    TensorUsage ifm2Usage, std::shared_ptr<Tensor> &ifm2, TensorUsage ifm3Usage, std::shared_ptr<Tensor> &ifm3,
+    TensorUsage ofmUsage, std::shared_ptr<Tensor> &ofm);
 
 // Helpers for Scheduler IR
 // -----------------------------
-- 
GitLab
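
Note (not part of the patch): the sketch below reproduces, in isolation, the pad-after adjustment that both the removed TFLiteGraphOptimiser::CalcPadAfter and the new CalcPadAfter lambda compute. The NeededTotalPadding helper shown here is an assumption about its behaviour (the total SAME padding for the given input size, stride and dilated filter size); it is not copied from regor.

// Standalone sketch of the bottom/right padding adjustment used when a PAD
// is folded into a convolution as explicit padding.
#include <algorithm>
#include <cassert>

// Assumed behaviour of NeededTotalPadding: total SAME padding needed so that
// the output size equals ceil(inputSize / stride).
static int NeededTotalPadding(int inputSize, int stride, int filterSize)
{
    const int outSize = (inputSize + stride - 1) / stride;  // ceil(inputSize / stride)
    return std::max(0, (outSize - 1) * stride + filterSize - inputSize);
}

// Same arithmetic as the CalcPadAfter lambda in the patch: trim the
// bottom/right padding so an explicitly padded convolution matches
// PAD followed by a VALID convolution.
static int CalcPadAfter(int inputSize, int stride, int filterSize, int padBefore, int padAfter)
{
    const int totalPadding = NeededTotalPadding(inputSize, stride, filterSize);
    const int remainderDiff = padAfter % stride - (totalPadding - padBefore) % stride;
    return std::max(0, padAfter - remainderDiff - (remainderDiff >= 0 ? 0 : stride));
}

int main()
{
    // Case from the new unit test: 7-wide IFM, stride 1, 6-wide kernel,
    // pad before = 1, pad after = 4: with stride 1 no adjustment is needed.
    assert(CalcPadAfter(7, 1, 6, 1, 4) == 4);

    // Hypothetical stride-2 case: pad after 2 is trimmed to 1, because the
    // last padded column is never visited by the sliding window.
    assert(CalcPadAfter(7, 2, 3, 1, 2) == 1);
    return 0;
}

For the shapes used in the new unit test (stride 1 everywhere) the adjustment is a no-op, which is why the test expects the PAD values 1/3/4/2 to be carried over unchanged into the Conv2D kernel padding.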