diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.cpp b/ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.cpp
index c2f352b8b68e06d47b32717b28b9882165833215..be11613441aca0738fec19c9ba2de47391e73b6a 100644
--- a/ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.cpp
+++ b/ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.cpp
@@ -1732,7 +1732,7 @@ void EthosU85RCSGenerator::GenerateCommon(const HLCStripe *stripe, bool useGloba
 {
     auto op = stripe->operation.get();
     int32_t scalarValue = 0;
-    bool isScalar = IsScalar(op->ifm[0], scalarValue) && IsElementwise(op->type);
+    bool isScalar = IsElementwise(op->type) && IsScalar(op->ifm[0], scalarValue);
     assert(stripe->opGroup != nullptr);
     EthosU85OpGroup *opGroup = static_cast<EthosU85OpGroup *>(stripe->opGroup);
     int ofmCb = opGroup->ChainingBuffer(op->ofm.uid);
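Note on the reordering above: `&&` evaluates left to right and short-circuits, so placing the cheap `IsElementwise(op->type)` check first means `IsScalar(op->ifm[0], scalarValue)`, which inspects the IFM and writes through an out-parameter, is only entered for elementwise operations. A minimal standalone sketch of that effect, using hypothetical stand-ins for the two regor helpers (the real declarations live in the compiler sources):

#include <cstdint>
#include <cstdio>

// Hypothetical stand-ins, only to illustrate the evaluation-order point.
enum class OpType { Add, Conv2D };

static bool IsElementwise(OpType type)
{
    return type == OpType::Add;  // cheap enum check
}

static bool IsScalar(const int32_t *ifm, int32_t &scalarValue)
{
    // Costlier probe that also writes through an out-parameter.
    if ( ifm == nullptr ) return false;
    scalarValue = *ifm;
    return true;
}

int main()
{
    int32_t scalarValue = 0;
    const int32_t *ifm = nullptr;  // e.g. an input with no constant scalar

    // With the type check first, && short-circuits and IsScalar() is never
    // entered for non-elementwise ops: no cost, no side effect on scalarValue.
    bool isScalar = IsElementwise(OpType::Conv2D) && IsScalar(ifm, scalarValue);
    std::printf("isScalar=%d scalarValue=%d\n", int(isScalar), int(scalarValue));
    return 0;
}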
diff --git a/ethosu/regor/compiler/graphir_optimiser.cpp b/ethosu/regor/compiler/graphir_optimiser.cpp
index 4442d402c7951c8356f8e6ee0c4db8963a998854..ab6337653b836c175f31d0436af9706c57114264 100644
--- a/ethosu/regor/compiler/graphir_optimiser.cpp
+++ b/ethosu/regor/compiler/graphir_optimiser.cpp
@@ -676,14 +676,10 @@ Operation *GraphIrOptimiser::RewriteRescale(Graph *const, Operation *const opera
 }
 
 Operation *GraphIrOptimiser::MakeFillOperation(TensorConnection *const ofmConn, const Shape &ofmShape,
-    const TensorSlice &ofmSlice, std::shared_ptr<Tensor> padTensor, OpType opType)
+    const TensorSlice &ofmSlice, std::shared_ptr<Tensor> padTensor)
 {
-    auto fillOp = std::make_shared<Operation>(opType);
-    auto &ifmConn = fillOp->ConnectInput(TensorUsage::IFM, padTensor);
-    if ( opType == OpType::MemoryCopy )
-    {
-        ifmConn.Set(ofmSlice.shape);
-    }
+    auto fillOp = std::make_shared<Operation>(OpType::MemoryCopy);
+    fillOp->ConnectInput(TensorUsage::IFM, padTensor).Set(ofmSlice.shape);
     fillOp->CopyOutput(TensorUsage::OFM, *ofmConn);
     fillOp->Output(TensorUsage::OFM)->Set(ofmShape).Set(ofmSlice).Set(RoundMode::NATURAL);
     return fillOp.get();
@@ -706,69 +702,60 @@ Operation *GraphIrOptimiser::RewritePad(Graph *const, Operation *const operation
     Shape paddingBefore = TensorToShape(paramsConn->tensor.get(), paramsConn->shape.Width(), 2, 0);
     Shape paddingAfter = TensorToShape(paramsConn->tensor.get(), paramsConn->shape.Width(), 2, 1);
 
-    OpType fillOpType;
     std::shared_ptr<Tensor> padTensor;
     DataType dataType = ofmConn->tensor->Type();
-    if ( _constraints->SupportsNot() )
+
+    // Find the largest required pad area and create a constant of that size filled
+    // with the padding value. Then memcopy slices of this tensor to the different
+    // axes to be padded.
+    int maxElements = 0;
+    for ( int axis = 0; axis < ofmShape.Size(); axis++ )
     {
-        // Native support for elementwise Not can be utilized to broadcast a single scalar value
-        // to the whole area to be filled.
-        fillOpType = OpType::Not;
-        padTensor = CreateConstTensor("pad_const", dataType, ~padConst);
+        int padElements = (ofmShape.Elements() / ofmShape[axis]) * std::max(paddingBefore[axis], paddingAfter[axis]);
+        maxElements = std::max(maxElements, padElements);
     }
-    else
-    {
-        // Fallback case - find the largest required pad area and create a constant of that size
-        // filled with the padding value. Then memcopy slices of this tensor to the different
-        // axes to be padded.
-        int maxElements = 0;
-        for ( int axis = 0; axis < ofmShape.Size(); axis++ )
-        {
-            int padElements = (ofmShape.Elements() / ofmShape[axis]) * std::max(paddingBefore[axis], paddingAfter[axis]);
-            maxElements = std::max(maxElements, padElements);
-        }
-        fillOpType = OpType::MemoryCopy;
-        int bits = DataTypeSizeBits(dataType);
-        // Mask out the bits from the original constant to force a zero extension regardless
-        // of signedness.
-        uint32_t fillPattern = uint32_t(padConst) & (~0u >> std::max(32 - bits, 0));
-        // Then replicate the bits from the original constant to the rest of the 32-bit value if needed.
-        // So for example the 8-bit value -2 (0xfe) is replicated to 0xfefefefe, while the 16-bit value
-        // -2 (0xfffe) becomes 0xfffefffe.
-        if ( bits < 16 )
-        {
-            fillPattern |= fillPattern << 8;
-        }
-        if ( bits < 32 )
-        {
-            fillPattern |= fillPattern << 16;
-        }
-        std::vector<uint32_t> buffer(DivRoundUp(DataTypeStorageSizeBytes(dataType, maxElements), 4), fillPattern);
-        const Shape padShape = Shape(maxElements);
-        padTensor = CreateConstTensor("pad_const", dataType, std::make_shared<Buffer>(std::move(buffer)), &padShape);
+    int bits = DataTypeSizeBits(dataType);
+    // Mask out the bits from the original constant to force a zero extension regardless
+    // of signedness.
+    uint32_t fillPattern = uint32_t(padConst) & (~0u >> std::max(32 - bits, 0));
+    // Then replicate the bits from the original constant to the rest of the 32-bit value if needed.
+    // So for example the 8-bit value -2 (0xfe) is replicated to 0xfefefefe, while the 16-bit value
+    // -2 (0xfffe) becomes 0xfffefffe.
+    if ( bits < 16 )
+    {
+        fillPattern |= fillPattern << 8;
     }
-
-    for ( int axis = 0; axis < ifmConn->shape.Size(); axis++ )
+    if ( bits < 32 )
+    {
+        fillPattern |= fillPattern << 16;
+    }
+    std::vector<uint32_t> buffer(DivRoundUp(DataTypeStorageSizeBytes(dataType, maxElements), 4), fillPattern);
+    const Shape padShape = Shape(maxElements);
+    padTensor = CreateConstTensor("pad_const", dataType, std::make_shared<Buffer>(std::move(buffer)), &padShape);
+
+    // Padding tensors of higher than rank 4 or rank 4 with a batch larger than 1 requires reshaping to a 3D shape
+    // (HWC) where W is the dimension to pad. Only use this strategy when necessary since it is often slower.
+    const Shape ifmShape = ifmConn->shape;
+    bool reshapeAndPadW = ifmShape.Size() > 4 || (ifmShape.Size() == 4 && ifmShape.Batch() > 1);
+    for ( int axis = 0; axis < ifmShape.Size(); axis++ )
     {
-        // Reshape the IFM/OFM/padding to a 3D shape (HWC) where W dimension is the dimension to pad
-        Shape newIfmShape = ReshapeTo3DAroundAxis(ifmConn->shape, axis);
-        Shape newOfmShape = ReshapeTo3DAroundAxis(ofmShape, axis);
-        Shape newPaddingBefore = ReshapeTo3DAroundAxis(paddingBefore, axis, 0);
+        Shape newOfmShape = reshapeAndPadW ? ReshapeTo3DAroundAxis(ofmShape, axis) : ofmShape;
+        int padAxis = reshapeAndPadW ? 1 : axis;
 
         const int padBefore = paddingBefore[axis];
         if ( padBefore )
         {
-            TensorSlice newOfmSlice = {newPaddingBefore.WithWidth(0), newOfmShape.WithWidth(padBefore)};
-            auto fillOp = MakeFillOperation(ofmConn, newOfmShape, newOfmSlice, padTensor, fillOpType);
+            TensorSlice newOfmSlice = {newOfmShape.WithZeros(), newOfmShape.With(padAxis, padBefore)};
+            auto fillOp = MakeFillOperation(ofmConn, newOfmShape, newOfmSlice, padTensor);
             RecordOptimisation(operation, fillOp);
         }
 
         const int padAfter = paddingAfter[axis];
         if ( padAfter )
         {
-            TensorSlice newOfmSlice = {newPaddingBefore.WithWidth(padBefore + newIfmShape.Width()), newOfmShape.WithWidth(padAfter)};
-            auto fillOp = MakeFillOperation(ofmConn, newOfmShape, newOfmSlice, padTensor, fillOpType);
+            TensorSlice newOfmSlice = {newOfmShape.With(padAxis, newOfmShape[padAxis] - padAfter), newOfmShape.With(padAxis, padAfter)};
+            auto fillOp = MakeFillOperation(ofmConn, newOfmShape, newOfmSlice, padTensor);
             RecordOptimisation(operation, fillOp);
         }
     }
@@ -777,7 +764,7 @@ Operation *GraphIrOptimiser::RewritePad(Graph *const, Operation *const operation
     auto copyOp = std::make_shared<Operation>(OpType::MemoryCopy);
     copyOp->CopyInput(TensorUsage::IFM, *ifmConn);
    copyOp->CopyOutput(TensorUsage::OFM, *ofmConn);
-    copyOp->Output(TensorUsage::OFM)->Set({paddingBefore, ifmConn->shape}).Set(RoundMode::NATURAL);
+    copyOp->Output(TensorUsage::OFM)->Set({paddingBefore, ifmShape}).Set(RoundMode::NATURAL);
     RecordOptimisation(operation, copyOp.get());
 
     returnOp = copyOp.get();
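The fill-pattern logic above packs the pad constant into a full 32-bit word so that a single std::vector<uint32_t> can back the constant tensor for 8-, 16- and 32-bit element types. A self-contained sketch of just that masking/replication step, wrapped in a hypothetical MakeFillPattern() helper for illustration and checked against the examples quoted in the diff comments:

#include <algorithm>
#include <cassert>
#include <cstdint>

// Standalone copy of the masking/replication step from RewritePad.
static uint32_t MakeFillPattern(int32_t padConst, int bits)
{
    // Mask out the bits of the constant to force a zero extension regardless
    // of signedness (e.g. the 8-bit value -2 becomes 0x000000fe).
    uint32_t fillPattern = uint32_t(padConst) & (~0u >> std::max(32 - bits, 0));
    // Replicate the element across the rest of the 32-bit word if needed.
    if ( bits < 16 ) fillPattern |= fillPattern << 8;
    if ( bits < 32 ) fillPattern |= fillPattern << 16;
    return fillPattern;
}

int main()
{
    // The examples from the comments in the patch:
    assert(MakeFillPattern(-2, 8) == 0xfefefefeu);
    assert(MakeFillPattern(-2, 16) == 0xfffefffeu);
    assert(MakeFillPattern(-2, 32) == 0xfffffffeu);
    return 0;
}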
diff --git a/ethosu/regor/compiler/graphir_optimiser.hpp b/ethosu/regor/compiler/graphir_optimiser.hpp
index 8a14c700f0f6c906249de593e6f8bffc09d82d78..e54590a9e2a098f7eee679293acc932577b37424 100644
--- a/ethosu/regor/compiler/graphir_optimiser.hpp
+++ b/ethosu/regor/compiler/graphir_optimiser.hpp
@@ -75,7 +75,7 @@ private:
     Operation *UnrollConv(Graph *const, Operation *const operation);
     // Utility/Helper methods
     Operation *MakeFillOperation(TensorConnection *const ofmConn, const Shape &ofmShape, const TensorSlice &ofmSlice,
-        std::shared_ptr<Tensor> padTensor, OpType opType);
+        std::shared_ptr<Tensor> padTensor);
 
     // The graph optimisation steps.
     // Order matters, array of rewrites processed in order.
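For reference, the new slice arithmetic in RewritePad addresses the pad-after region from the end of the padded axis (newOfmShape[padAxis] - padAfter) rather than deriving it from the pad-before amount plus the IFM extent; both yield the same offset because the OFM extent is padBefore + ifm + padAfter. A one-dimensional walk-through with plain ints standing in for the regor Shape class:

#include <cstdio>

int main()
{
    const int ifm = 5;                           // IFM extent on the padded axis
    const int padBefore = 2, padAfter = 3;
    const int ofm = padBefore + ifm + padAfter;  // OFM extent: 10

    // Pad-before slice: offset 0, extent padBefore -> covers [0, 2)
    std::printf("before: offset=%d size=%d\n", 0, padBefore);
    // Pad-after slice: offset ofm - padAfter, extent padAfter -> covers [7, 10);
    // the old code computed the same offset as padBefore + ifm.
    std::printf("after:  offset=%d size=%d\n", ofm - padAfter, padAfter);
    // The final MemoryCopy places the IFM at offset padBefore -> covers [2, 7)
    std::printf("copy:   offset=%d size=%d\n", padBefore, ifm);
    return 0;
}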