From f68df6c0ef0c56d397956e0d6d6924c195f0e1d3 Mon Sep 17 00:00:00 2001
From: Jacob Bohlin <jacob.bohlin@arm.com>
Date: Mon, 17 Feb 2025 09:23:58 +0000
Subject: [PATCH] MLBEDSW-10419 Fix resnet_v2_50_int8 performance regression

* The generic solution of reshaping to 3D and always padding in the
  W-dimension caused performance regressions due to limitations in
  utilizing the NHCWB16 format. With this change, the reshape solution
  is only used when necessary.

* Also cleaned up the RewritePad function and made it so MemoryCopy is
  always used for applying the padding, regardless of NPU. Previously,
  elementwise NOT was used for Ethos-U85.

Change-Id: I813d04caa165da4eb9586d220a0ff1554bb07083
Signed-off-by: Jacob Bohlin <jacob.bohlin@arm.com>
---
 .../ethos_u85_register_cs_generator.cpp     |  2 +-
 ethosu/regor/compiler/graphir_optimiser.cpp | 95 ++++++++-----------
 ethosu/regor/compiler/graphir_optimiser.hpp |  2 +-
 3 files changed, 43 insertions(+), 56 deletions(-)

diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.cpp b/ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.cpp
index c2f352b8..be116134 100644
--- a/ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.cpp
+++ b/ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.cpp
@@ -1732,7 +1732,7 @@ void EthosU85RCSGenerator::GenerateCommon(const HLCStripe *stripe, bool useGloba
 {
     auto op = stripe->operation.get();
     int32_t scalarValue = 0;
-    bool isScalar = IsScalar(op->ifm[0], scalarValue) && IsElementwise(op->type);
+    bool isScalar = IsElementwise(op->type) && IsScalar(op->ifm[0], scalarValue);
     assert(stripe->opGroup != nullptr);
     EthosU85OpGroup *opGroup = static_cast<EthosU85OpGroup *>(stripe->opGroup);
     int ofmCb = opGroup->ChainingBuffer(op->ofm.uid);
diff --git a/ethosu/regor/compiler/graphir_optimiser.cpp b/ethosu/regor/compiler/graphir_optimiser.cpp
index 4442d402..ab633765 100644
--- a/ethosu/regor/compiler/graphir_optimiser.cpp
+++ b/ethosu/regor/compiler/graphir_optimiser.cpp
@@ -676,14 +676,10 @@ Operation *GraphIrOptimiser::RewriteRescale(Graph *const, Operation *const opera
 }
 
 Operation *GraphIrOptimiser::MakeFillOperation(TensorConnection *const ofmConn, const Shape &ofmShape,
-    const TensorSlice &ofmSlice, std::shared_ptr<Tensor> padTensor, OpType opType)
+    const TensorSlice &ofmSlice, std::shared_ptr<Tensor> padTensor)
 {
-    auto fillOp = std::make_shared<Operation>(opType);
-    auto &ifmConn = fillOp->ConnectInput(TensorUsage::IFM, padTensor);
-    if ( opType == OpType::MemoryCopy )
-    {
-        ifmConn.Set(ofmSlice.shape);
-    }
+    auto fillOp = std::make_shared<Operation>(OpType::MemoryCopy);
+    fillOp->ConnectInput(TensorUsage::IFM, padTensor).Set(ofmSlice.shape);
     fillOp->CopyOutput(TensorUsage::OFM, *ofmConn);
     fillOp->Output(TensorUsage::OFM)->Set(ofmShape).Set(ofmSlice).Set(RoundMode::NATURAL);
     return fillOp.get();
@@ -706,69 +702,60 @@ Operation *GraphIrOptimiser::RewritePad(Graph *const, Operation *const operation
     Shape paddingBefore = TensorToShape(paramsConn->tensor.get(), paramsConn->shape.Width(), 2, 0);
     Shape paddingAfter = TensorToShape(paramsConn->tensor.get(), paramsConn->shape.Width(), 2, 1);
 
-    OpType fillOpType;
     std::shared_ptr<Tensor> padTensor;
     DataType dataType = ofmConn->tensor->Type();
-    if ( _constraints->SupportsNot() )
+
+    // Find the largest required pad area and create a constant of that size filled
+    // with the padding value. Then memcopy slices of this tensor to the different
+    // axes to be padded.
+    int maxElements = 0;
+    for ( int axis = 0; axis < ofmShape.Size(); axis++ )
     {
-        // Native support for elementwise Not can be utilized to broadcast a single scalar value
-        // to the whole area to be filled.
-        fillOpType = OpType::Not;
-        padTensor = CreateConstTensor("pad_const", dataType, ~padConst);
+        int padElements = (ofmShape.Elements() / ofmShape[axis]) * std::max(paddingBefore[axis], paddingAfter[axis]);
+        maxElements = std::max(maxElements, padElements);
     }
-    else
-    {
-        // Fallback case - find the largest required pad area and create a constant of that size
-        // filled with the padding value. Then memcopy slices of this tensor to the different
-        // axes to be padded.
-        int maxElements = 0;
-        for ( int axis = 0; axis < ofmShape.Size(); axis++ )
-        {
-            int padElements = (ofmShape.Elements() / ofmShape[axis]) * std::max(paddingBefore[axis], paddingAfter[axis]);
-            maxElements = std::max(maxElements, padElements);
-        }
-        fillOpType = OpType::MemoryCopy;
-        int bits = DataTypeSizeBits(dataType);
-        // Mask out the bits from the original constant to force a zero extension regardless
-        // of signedness.
-        uint32_t fillPattern = uint32_t(padConst) & (~0u >> std::max(32 - bits, 0));
-        // Then replicate the bits from the original constant to the rest of the 32-bit value if needed.
-        // So for example the 8-bit value -2 (0xfe) is replicated to 0xfefefefe, while the 16-bit value
-        // -2 (0xfffe) becomes 0xfffefffe.
-        if ( bits < 16 )
-        {
-            fillPattern |= fillPattern << 8;
-        }
-        if ( bits < 32 )
-        {
-            fillPattern |= fillPattern << 16;
-        }
-        std::vector<uint32_t> buffer(DivRoundUp(DataTypeStorageSizeBytes(dataType, maxElements), 4), fillPattern);
-        const Shape padShape = Shape(maxElements);
-        padTensor = CreateConstTensor("pad_const", dataType, std::make_shared<Buffer>(std::move(buffer)), &padShape);
+    int bits = DataTypeSizeBits(dataType);
+    // Mask out the bits from the original constant to force a zero extension regardless
+    // of signedness.
+    uint32_t fillPattern = uint32_t(padConst) & (~0u >> std::max(32 - bits, 0));
+    // Then replicate the bits from the original constant to the rest of the 32-bit value if needed.
+    // So for example the 8-bit value -2 (0xfe) is replicated to 0xfefefefe, while the 16-bit value
+    // -2 (0xfffe) becomes 0xfffefffe.
+    if ( bits < 16 )
+    {
+        fillPattern |= fillPattern << 8;
     }
-
-    for ( int axis = 0; axis < ifmConn->shape.Size(); axis++ )
+    if ( bits < 32 )
+    {
+        fillPattern |= fillPattern << 16;
+    }
+    std::vector<uint32_t> buffer(DivRoundUp(DataTypeStorageSizeBytes(dataType, maxElements), 4), fillPattern);
+    const Shape padShape = Shape(maxElements);
+    padTensor = CreateConstTensor("pad_const", dataType, std::make_shared<Buffer>(std::move(buffer)), &padShape);
+
+    // Padding tensors of higher than rank 4 or rank 4 with a batch larger than 1 requires reshaping to a 3D shape
+    // (HWC) where W is the dimension to pad. Only use this strategy when necessary since it is often slower.
+    const Shape ifmShape = ifmConn->shape;
+    bool reshapeAndPadW = ifmShape.Size() > 4 || (ifmShape.Size() == 4 && ifmShape.Batch() > 1);
+    for ( int axis = 0; axis < ifmShape.Size(); axis++ )
     {
-        // Reshape the IFM/OFM/padding to a 3D shape (HWC) where W dimension is the dimension to pad
-        Shape newIfmShape = ReshapeTo3DAroundAxis(ifmConn->shape, axis);
-        Shape newOfmShape = ReshapeTo3DAroundAxis(ofmShape, axis);
-        Shape newPaddingBefore = ReshapeTo3DAroundAxis(paddingBefore, axis, 0);
+        Shape newOfmShape = reshapeAndPadW ? ReshapeTo3DAroundAxis(ofmShape, axis) : ofmShape;
+        int padAxis = reshapeAndPadW ? 1 : axis;
 
         const int padBefore = paddingBefore[axis];
         if ( padBefore )
         {
-            TensorSlice newOfmSlice = {newPaddingBefore.WithWidth(0), newOfmShape.WithWidth(padBefore)};
-            auto fillOp = MakeFillOperation(ofmConn, newOfmShape, newOfmSlice, padTensor, fillOpType);
+            TensorSlice newOfmSlice = {newOfmShape.WithZeros(), newOfmShape.With(padAxis, padBefore)};
+            auto fillOp = MakeFillOperation(ofmConn, newOfmShape, newOfmSlice, padTensor);
             RecordOptimisation(operation, fillOp);
         }
 
         const int padAfter = paddingAfter[axis];
         if ( padAfter )
         {
-            TensorSlice newOfmSlice = {newPaddingBefore.WithWidth(padBefore + newIfmShape.Width()), newOfmShape.WithWidth(padAfter)};
-            auto fillOp = MakeFillOperation(ofmConn, newOfmShape, newOfmSlice, padTensor, fillOpType);
+            TensorSlice newOfmSlice = {newOfmShape.With(padAxis, newOfmShape[padAxis] - padAfter), newOfmShape.With(padAxis, padAfter)};
+            auto fillOp = MakeFillOperation(ofmConn, newOfmShape, newOfmSlice, padTensor);
             RecordOptimisation(operation, fillOp);
         }
     }
 
@@ -777,7 +764,7 @@ Operation *GraphIrOptimiser::RewritePad(Graph *const, Operation *const operation
     auto copyOp = std::make_shared<Operation>(OpType::MemoryCopy);
     copyOp->CopyInput(TensorUsage::IFM, *ifmConn);
     copyOp->CopyOutput(TensorUsage::OFM, *ofmConn);
-    copyOp->Output(TensorUsage::OFM)->Set({paddingBefore, ifmConn->shape}).Set(RoundMode::NATURAL);
+    copyOp->Output(TensorUsage::OFM)->Set({paddingBefore, ifmShape}).Set(RoundMode::NATURAL);
     RecordOptimisation(operation, copyOp.get());
 
     returnOp = copyOp.get();
diff --git a/ethosu/regor/compiler/graphir_optimiser.hpp b/ethosu/regor/compiler/graphir_optimiser.hpp
index 8a14c700..e54590a9 100644
--- a/ethosu/regor/compiler/graphir_optimiser.hpp
+++ b/ethosu/regor/compiler/graphir_optimiser.hpp
@@ -75,7 +75,7 @@ private:
     Operation *UnrollConv(Graph *const, Operation *const operation);
     // Utility/Helper methods
     Operation *MakeFillOperation(TensorConnection *const ofmConn, const Shape &ofmShape, const TensorSlice &ofmSlice,
-        std::shared_ptr<Tensor> padTensor, OpType opType);
+        std::shared_ptr<Tensor> padTensor);
 
     // The graph optimisation steps.
     // Order matters, array of rewrites processed in order.
-- 
GitLab
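
Notes below the signature (not part of the commit, so git am ignores them):

The fill-pattern logic in the RewritePad hunk packs the pad constant into a
32-bit word: mask to the element width to force a zero extension, then
replicate the element across the word. A minimal standalone C++ sketch of
that arithmetic; MakeFillPattern is a hypothetical helper name, the patch
computes this inline:

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdio>

// Hypothetical helper mirroring the masking/replication in RewritePad.
static uint32_t MakeFillPattern(int32_t padConst, int bits)
{
    assert(bits == 8 || bits == 16 || bits == 32);
    // Mask to the element width so zero extension is forced regardless of signedness.
    uint32_t pattern = uint32_t(padConst) & (~0u >> std::max(32 - bits, 0));
    // Replicate the element to fill the whole 32-bit word.
    if ( bits < 16 ) pattern |= pattern << 8;
    if ( bits < 32 ) pattern |= pattern << 16;
    return pattern;
}

int main()
{
    std::printf("%08x\n", unsigned(MakeFillPattern(-2, 8)));   // fefefefe
    std::printf("%08x\n", unsigned(MakeFillPattern(-2, 16)));  // fffefffe
    std::printf("%08x\n", unsigned(MakeFillPattern(-2, 32)));  // fffffffe
    return 0;
}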
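
The pad-constant sizing and slice placement can be sanity-checked with plain
integers. This sketch substitutes std::vector<int> for regor's Shape and
TensorSlice types, so every name in it is illustrative; it shows why one
constant of maxElements values suffices, and that the "after" slice starts at
ofmShape[padAxis] - padAfter, matching the newOfmShape.With(padAxis, ...)
calls in the patch:

#include <algorithm>
#include <cstdio>
#include <vector>

int main()
{
    // Example padded OFM shape (NHWC) with one row/column of padding in H and W.
    std::vector<int> ofmShape = {1, 8, 8, 16};
    std::vector<int> paddingBefore = {0, 1, 1, 0};
    std::vector<int> paddingAfter = {0, 1, 1, 0};

    long ofmElements = 1;
    for ( int d : ofmShape ) ofmElements *= d;

    // Largest slab needed along any single axis; slices are filled one axis
    // at a time, so the constant never has to be bigger than this.
    long maxElements = 0;
    for ( size_t axis = 0; axis < ofmShape.size(); axis++ )
    {
        long padElements = (ofmElements / ofmShape[axis]) * std::max(paddingBefore[axis], paddingAfter[axis]);
        maxElements = std::max(maxElements, padElements);
    }
    std::printf("pad_const elements: %ld\n", maxElements);  // 128 for this example

    for ( size_t axis = 0; axis < ofmShape.size(); axis++ )
    {
        if ( paddingBefore[axis] )
            std::printf("axis %zu: fill offsets [0, %d)\n", axis, paddingBefore[axis]);
        if ( paddingAfter[axis] )
            std::printf("axis %zu: fill offsets [%d, %d)\n", axis, ofmShape[axis] - paddingAfter[axis], ofmShape[axis]);
    }
    return 0;
}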
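
The slower strategy that the patch now gates behind reshapeAndPadW relies on
ReshapeTo3DAroundAxis, whose body is not shown here. The sketch below is an
assumption inferred from the deleted comment ("Reshape the IFM/OFM/padding to
a 3D shape (HWC) where W dimension is the dimension to pad"), not regor's
actual implementation: leading axes fold into H, the padded axis becomes W,
trailing axes fold into C.

#include <cstdio>
#include <vector>

// Assumed behaviour only; the real function takes regor Shape objects and,
// for the padding shapes, an extra fill-value argument.
static std::vector<int> ReshapeTo3DAroundAxis(const std::vector<int> &shape, size_t axis)
{
    int h = 1, c = 1;
    for ( size_t i = 0; i < axis; i++ ) h *= shape[i];                 // leading axes -> H
    for ( size_t i = axis + 1; i < shape.size(); i++ ) c *= shape[i];  // trailing axes -> C
    return {h, shape[axis], c};                                        // padded axis -> W
}

int main()
{
    // A rank-5 shape: the new reshapeAndPadW condition still routes these this way.
    std::vector<int> s = {2, 3, 4, 5, 6};
    for ( size_t axis = 0; axis < s.size(); axis++ )
    {
        std::vector<int> r = ReshapeTo3DAroundAxis(s, axis);
        std::printf("axis %zu -> {%d, %d, %d}\n", axis, r[0], r[1], r[2]);
    }
    return 0;
}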