From af5f7df9748be6943c5bdd49e47e1568e53712d6 Mon Sep 17 00:00:00 2001
From: Jacob Bohlin
Date: Tue, 21 Jan 2025 09:21:13 +0000
Subject: [PATCH] MLBEDSW-10274 Add TOSA Pad support for Ethos-U55/U65

Change-Id: I91e1fdc69807b0a8702663932944b327f4728a1e
Signed-off-by: Jacob Bohlin
---
 .../architecture/architecture_constraints.hpp |  1 +
 .../ethosu55/ethos_u55_constraints.hpp        |  1 +
 .../ethosu85/ethos_u85_constraints.hpp        |  1 +
 ethosu/regor/compiler/graphir_optimiser.cpp   | 88 ++++++++++++++-----
 ethosu/regor/compiler/graphir_optimiser.hpp   |  6 +-
 5 files changed, 74 insertions(+), 23 deletions(-)

diff --git a/ethosu/regor/architecture/architecture_constraints.hpp b/ethosu/regor/architecture/architecture_constraints.hpp
index c5e24beb..dab6adc1 100644
--- a/ethosu/regor/architecture/architecture_constraints.hpp
+++ b/ethosu/regor/architecture/architecture_constraints.hpp
@@ -103,6 +103,7 @@ public:
     virtual bool SupportsAccumulatorSaveRestore() = 0;
     virtual bool SupportsLeakyRelu(bool quantized, DataType type) = 0;
     virtual bool SupportsNegativeStrides() = 0;
+    virtual bool SupportsNot() = 0;
 
     bool CanExecute(const ExecutionQuery &query)
     {
diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_constraints.hpp b/ethosu/regor/architecture/ethosu55/ethos_u55_constraints.hpp
index b0355f65..b091ee5a 100644
--- a/ethosu/regor/architecture/ethosu55/ethos_u55_constraints.hpp
+++ b/ethosu/regor/architecture/ethosu55/ethos_u55_constraints.hpp
@@ -43,6 +43,7 @@ public:
     bool SupportsCast(OpType opType, DataType ifmType, DataType ofmType) override;
     bool SupportsNonMatchingShapes(const Shape &ifmShape, const Shape &ifm2Shape, const Shape &ofmShape) override;
     bool SupportsNegativeStrides() override { return true; };
+    bool SupportsNot() override { return false; };
 
 private:
     ArchEthosU55 *_arch;
diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85_constraints.hpp b/ethosu/regor/architecture/ethosu85/ethos_u85_constraints.hpp
index eab0a264..fcd6a369 100644
--- a/ethosu/regor/architecture/ethosu85/ethos_u85_constraints.hpp
+++ b/ethosu/regor/architecture/ethosu85/ethos_u85_constraints.hpp
@@ -43,6 +43,7 @@ public:
     bool SupportsCast(OpType opType, DataType ifmType, DataType ofmType) override;
     bool SupportsNonMatchingShapes(const Shape &ifmShape, const Shape &ifm2Shape, const Shape &ofmShape) override;
     bool SupportsNegativeStrides() override { return false; };
+    bool SupportsNot() override { return true; };
 
 private:
     ArchEthosU85 *_arch;
diff --git a/ethosu/regor/compiler/graphir_optimiser.cpp b/ethosu/regor/compiler/graphir_optimiser.cpp
index 30add616..5236b0c3 100644
--- a/ethosu/regor/compiler/graphir_optimiser.cpp
+++ b/ethosu/regor/compiler/graphir_optimiser.cpp
@@ -675,7 +675,20 @@ Operation *GraphIrOptimiser::RewriteRescale(Graph *const, Operation *const opera
     return returnOp;
 }
 
-// Rewrite TOSA PAD to number of MemoryCopy ops
+Operation *GraphIrOptimiser::MakeFillOperation(TensorConnection *const ofmConn, const Shape &ofmShape,
+    const TensorSlice &ofmSlice, std::shared_ptr<Tensor> padTensor, OpType opType)
+{
+    auto fillOp = std::make_shared<Operation>(opType);
+    auto &ifmConn = fillOp->ConnectInput(TensorUsage::IFM, padTensor);
+    if ( opType == OpType::MemoryCopy )
+    {
+        ifmConn.Set(ofmSlice.shape);
+    }
+    fillOp->CopyOutput(TensorUsage::OFM, *ofmConn);
+    fillOp->Output(TensorUsage::OFM)->Set(ofmShape).Set(ofmSlice).Set(RoundMode::NATURAL);
+    return fillOp.get();
+}
+
 Operation *GraphIrOptimiser::RewritePad(Graph *const, Operation *const operation)
 {
     Operation *returnOp = operation;
@@ -684,48 +697,79 @@ Operation *GraphIrOptimiser::RewritePad(Graph *const, Operation *const operation
 {
     const auto &ifmConn = operation->Input(TensorUsage::IFM0);
     const auto &ofmConn = operation->Output(TensorUsage::OFM);
+    const Shape ofmShape = ofmConn->shape;
     const auto &paramsConn = operation->Input(TensorUsage::Params);
     const auto &attr = operation->Attribute<pad_attr_t>();
-    const double pad_const = attr->pad_const;
-    const int not_pad_const = ~int(pad_const);
+    const int padConst = int(attr->pad_const);
 
     // Decode the padding before and after each dimension as two shapes
     Shape paddingBefore = TensorToShape(paramsConn->tensor.get(), paramsConn->shape.Width(), 2, 0);
     Shape paddingAfter = TensorToShape(paramsConn->tensor.get(), paramsConn->shape.Width(), 2, 1);
 
+    OpType fillOpType;
+    std::shared_ptr<Tensor> padTensor;
+    DataType dataType = ofmConn->tensor->Type();
+    if ( _constraints->SupportsNot() )
+    {
+        // Native support for elementwise Not can be utilized to broadcast a single scalar value
+        // to the whole area to be filled.
+        fillOpType = OpType::Not;
+        padTensor = CreateConstTensor("pad_const", dataType, ~padConst);
+    }
+    else
+    {
+        // Fallback case - find the largest required pad area and create a constant of that size
+        // filled with the padding value. Then memcopy slices of this tensor to the different
+        // axes to be padded.
+        int maxElements = 0;
+        for ( int axis = 0; axis < ofmShape.Size(); axis++ )
+        {
+            int padElements = (ofmShape.Elements() / ofmShape[axis]) * std::max(paddingBefore[axis], paddingAfter[axis]);
+            maxElements = std::max(maxElements, padElements);
+        }
+
+        fillOpType = OpType::MemoryCopy;
+        int bits = DataTypeSizeBits(dataType);
+        // Mask out the bits from the original constant to force a zero extension regardless
+        // of signedness.
+        uint32_t fillPattern = uint32_t(padConst) & (~0u >> std::max(32 - bits, 0));
+        // Then replicate the bits from the original constant to the rest of the 32-bit value if needed.
+        // So for example the 8-bit value -2 (0xfe) is replicated to 0xfefefefe, while the 16-bit value
+        // -2 (0xfffe) becomes 0xfffefffe.
+        if ( bits < 16 )
+        {
+            fillPattern |= fillPattern << 8;
+        }
+        if ( bits < 32 )
+        {
+            fillPattern |= fillPattern << 16;
+        }
+        std::vector<uint32_t> buffer(DivRoundUp(DataTypeStorageSizeBytes(dataType, maxElements), 4), fillPattern);
+        const Shape padShape = Shape(maxElements);
+        padTensor = CreateConstTensor("pad_const", dataType, std::make_shared<Buffer>(std::move(buffer)), &padShape);
+    }
+
     for ( int axis = 0; axis < ifmConn->shape.Size(); axis++ )
     {
         // Reshape the IFM/OFM/padding to a 3D shape (HWC) where W dimension is the dimension to pad
         Shape newIfmShape = ReshapeTo3DAroundAxis(ifmConn->shape, axis);
-        Shape newOfmShape = ReshapeTo3DAroundAxis(ofmConn->shape, axis);
+        Shape newOfmShape = ReshapeTo3DAroundAxis(ofmShape, axis);
         Shape newPaddingBefore = ReshapeTo3DAroundAxis(paddingBefore, axis, 0);
 
         const int padBefore = paddingBefore[axis];
         if ( padBefore )
         {
-            Shape newOfmSliceOffset = newPaddingBefore.WithWidth(0);
-            Shape newOfmSliceShape = newOfmShape.WithWidth(padBefore);
-
-            // Fill padded elements with pad_const
-            auto fillOp = std::make_shared<Operation>(OpType::Not);
-            fillOp->ConnectInput(TensorUsage::IFM, CreateConstTensor("pad_const", ifmConn->tensor->Type(), not_pad_const));
-            fillOp->CopyOutput(TensorUsage::OFM, *ofmConn);
-            fillOp->Output(TensorUsage::OFM)->Set(newOfmShape).Set({newOfmSliceOffset, newOfmSliceShape}).Set(RoundMode::NATURAL);
-            RecordOptimisation(operation, fillOp.get());
+            TensorSlice newOfmSlice = {newPaddingBefore.WithWidth(0), newOfmShape.WithWidth(padBefore)};
+            auto fillOp = MakeFillOperation(ofmConn, newOfmShape, newOfmSlice, padTensor, fillOpType);
+            RecordOptimisation(operation, fillOp);
         }
 
         const int padAfter = paddingAfter[axis];
         if ( padAfter )
         {
-            Shape newOfmSliceOffset = newPaddingBefore.WithWidth(padBefore + newIfmShape.Width());
-            Shape newOfmSliceShape = newOfmShape.WithWidth(padAfter);
-
-            // Fill padded elements with pad_const
-            auto fillOp = std::make_shared<Operation>(OpType::Not);
-            fillOp->ConnectInput(TensorUsage::IFM, CreateConstTensor("pad_const", ifmConn->tensor->Type(), not_pad_const));
-            fillOp->CopyOutput(TensorUsage::OFM, *ofmConn);
-            fillOp->Output(TensorUsage::OFM)->Set(newOfmShape).Set({newOfmSliceOffset, newOfmSliceShape}).Set(RoundMode::NATURAL);
-            RecordOptimisation(operation, fillOp.get());
+            TensorSlice newOfmSlice = {newPaddingBefore.WithWidth(padBefore + newIfmShape.Width()), newOfmShape.WithWidth(padAfter)};
+            auto fillOp = MakeFillOperation(ofmConn, newOfmShape, newOfmSlice, padTensor, fillOpType);
+            RecordOptimisation(operation, fillOp);
         }
     }
diff --git a/ethosu/regor/compiler/graphir_optimiser.hpp b/ethosu/regor/compiler/graphir_optimiser.hpp
index 86dec4a3..8a14c700 100644
--- a/ethosu/regor/compiler/graphir_optimiser.hpp
+++ b/ethosu/regor/compiler/graphir_optimiser.hpp
@@ -1,5 +1,5 @@
 //
-// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates
+// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates
 //
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -73,6 +73,10 @@ private:
     void MoveToConsumer(const Operation *const operation, Operation *const cons);
     Operation *MoveSplitSliceToConsumer(Graph *const, Operation *const operation);
    Operation *UnrollConv(Graph *const, Operation *const operation);
+    // Utility/Helper methods
+    Operation *MakeFillOperation(TensorConnection *const ofmConn, const Shape &ofmShape, const TensorSlice &ofmSlice,
+        std::shared_ptr<Tensor> padTensor, OpType opType);
+
     // The graph optimisation steps.
     // Order matters, array of rewrites processed in order.
     // clang-format off
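
A note on the Not-based fill above: the "pad_const" tensor holds the
bitwise-inverted constant (~padConst), so the elementwise Not restores the
original pad value while broadcasting the scalar across the whole OFM slice.
A minimal sketch of that round trip, in plain C++ with no regor types
(illustrative only, not part of the patch):

    #include <cassert>
    #include <cstdint>

    int main()
    {
        int8_t padConst = -2;
        int8_t stored = int8_t(~padConst);  // value placed in the const "pad_const" tensor
        int8_t filled = int8_t(~stored);    // what the elementwise Not writes per element
        assert(filled == padConst);         // the round trip restores the pad constant
        return 0;
    }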
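Likewise, the MemoryCopy fallback builds its constant buffer from 32-bit
words, so the element-wide pad value has to be zero-extended and then
replicated across the word, exactly as the patch's comment describes. A
standalone sketch of that bit-replication step (the helper name
ReplicateFillPattern is made up for illustration):

    #include <algorithm>
    #include <cassert>
    #include <cstdint>

    // Splat an 8/16/32-bit pad constant across a 32-bit fill word.
    static uint32_t ReplicateFillPattern(int padConst, int bits)
    {
        // Mask to the element width so the sign extension of padConst
        // does not leak into the upper lanes.
        uint32_t fillPattern = uint32_t(padConst) & (~0u >> std::max(32 - bits, 0));
        if ( bits < 16 ) fillPattern |= fillPattern << 8;   // 8 -> 16 bits
        if ( bits < 32 ) fillPattern |= fillPattern << 16;  // 16 -> 32 bits
        return fillPattern;
    }

    int main()
    {
        assert(ReplicateFillPattern(-2, 8) == 0xfefefefeu);   // 0xfe replicated
        assert(ReplicateFillPattern(-2, 16) == 0xfffefffeu);  // 0xfffe replicated
        assert(ReplicateFillPattern(-2, 32) == 0xfffffffeu);  // already full width
        return 0;
    }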