From f68df6c0ef0c56d397956e0d6d6924c195f0e1d3 Mon Sep 17 00:00:00 2001
From: Jacob Bohlin <jacob.bohlin@arm.com>
Date: Mon, 17 Feb 2025 09:23:58 +0000
Subject: [PATCH] MLBEDSW-10419 Fix resnet_v2_50_int8 performance regression

* The generic solution of reshaping to 3D and always padding in the
  W-dimension caused performance regressions due to limitations in
  utilizing the NHCWB16 format. With this change, the reshape solution
  is only used when necessary.

* Also cleaned up the RewritePad function and made it so MemoryCopy is
  always used for applying the padding, regardless of NPU. Previously,
  elementwise NOT was used for Ethos-U85.

Change-Id: I813d04caa165da4eb9586d220a0ff1554bb07083
Signed-off-by: Jacob Bohlin <jacob.bohlin@arm.com>
---
 .../ethos_u85_register_cs_generator.cpp     |  2 +-
 ethosu/regor/compiler/graphir_optimiser.cpp | 95 ++++++++-----------
 ethosu/regor/compiler/graphir_optimiser.hpp |  2 +-
 3 files changed, 43 insertions(+), 56 deletions(-)

diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.cpp b/ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.cpp
index c2f352b8..be116134 100644
--- a/ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.cpp
+++ b/ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.cpp
@@ -1732,7 +1732,7 @@ void EthosU85RCSGenerator::GenerateCommon(const HLCStripe *stripe, bool useGloba
 {
     auto op = stripe->operation.get();
     int32_t scalarValue = 0;
-    bool isScalar = IsScalar(op->ifm[0], scalarValue) && IsElementwise(op->type);
+    bool isScalar = IsElementwise(op->type) && IsScalar(op->ifm[0], scalarValue);
     assert(stripe->opGroup != nullptr);
     EthosU85OpGroup *opGroup = static_cast<EthosU85OpGroup *>(stripe->opGroup);
     int ofmCb = opGroup->ChainingBuffer(op->ofm.uid);
diff --git a/ethosu/regor/compiler/graphir_optimiser.cpp b/ethosu/regor/compiler/graphir_optimiser.cpp
index 4442d402..ab633765 100644
--- a/ethosu/regor/compiler/graphir_optimiser.cpp
+++ b/ethosu/regor/compiler/graphir_optimiser.cpp
@@ -676,14 +676,10 @@ Operation *GraphIrOptimiser::RewriteRescale(Graph *const, Operation *const opera
 }
 
 Operation *GraphIrOptimiser::MakeFillOperation(TensorConnection *const ofmConn, const Shape &ofmShape,
-    const TensorSlice &ofmSlice, std::shared_ptr<Tensor> padTensor, OpType opType)
+    const TensorSlice &ofmSlice, std::shared_ptr<Tensor> padTensor)
 {
-    auto fillOp = std::make_shared<Operation>(opType);
-    auto &ifmConn = fillOp->ConnectInput(TensorUsage::IFM, padTensor);
-    if ( opType == OpType::MemoryCopy )
-    {
-        ifmConn.Set(ofmSlice.shape);
-    }
+    auto fillOp = std::make_shared<Operation>(OpType::MemoryCopy);
+    fillOp->ConnectInput(TensorUsage::IFM, padTensor).Set(ofmSlice.shape);
     fillOp->CopyOutput(TensorUsage::OFM, *ofmConn);
     fillOp->Output(TensorUsage::OFM)->Set(ofmShape).Set(ofmSlice).Set(RoundMode::NATURAL);
     return fillOp.get();
@@ -706,69 +702,60 @@ Operation *GraphIrOptimiser::RewritePad(Graph *const, Operation *const operation
     Shape paddingBefore = TensorToShape(paramsConn->tensor.get(), paramsConn->shape.Width(), 2, 0);
     Shape paddingAfter = TensorToShape(paramsConn->tensor.get(), paramsConn->shape.Width(), 2, 1);
 
-    OpType fillOpType;
     std::shared_ptr<Tensor> padTensor;
     DataType dataType = ofmConn->tensor->Type();
-    if ( _constraints->SupportsNot() )
+
+    // Find the largest required pad area and create a constant of that size filled
+    // with the padding value. Then memcopy slices of this tensor to the different
+    // axes to be padded.
+    int maxElements = 0;
+    for ( int axis = 0; axis < ofmShape.Size(); axis++ )
     {
-        // Native support for elementwise Not can be utilized to broadcast a single scalar value
-        // to the whole area to be filled.
-        fillOpType = OpType::Not;
-        padTensor = CreateConstTensor("pad_const", dataType, ~padConst);
+        int padElements = (ofmShape.Elements() / ofmShape[axis]) * std::max(paddingBefore[axis], paddingAfter[axis]);
+        maxElements = std::max(maxElements, padElements);
     }
-    else
-    {
-        // Fallback case - find the largest required pad area and create a constant of that size
-        // filled with the padding value. Then memcopy slices of this tensor to the different
-        // axes to be padded.
-        int maxElements = 0;
-        for ( int axis = 0; axis < ofmShape.Size(); axis++ )
-        {
-            int padElements = (ofmShape.Elements() / ofmShape[axis]) * std::max(paddingBefore[axis], paddingAfter[axis]);
-            maxElements = std::max(maxElements, padElements);
-        }
-        fillOpType = OpType::MemoryCopy;
-        int bits = DataTypeSizeBits(dataType);
-        // Mask out the bits from the original constant to force a zero extension regardless
-        // of signedness.
-        uint32_t fillPattern = uint32_t(padConst) & (~0u >> std::max(32 - bits, 0));
-        // Then replicate the bits from the original constant to the rest of the 32-bit value if needed.
-        // So for example the 8-bit value -2 (0xfe) is replicated to 0xfefefefe, while the 16-bit value
-        // -2 (0xfffe) becomes 0xfffefffe.
-        if ( bits < 16 )
-        {
-            fillPattern |= fillPattern << 8;
-        }
-        if ( bits < 32 )
-        {
-            fillPattern |= fillPattern << 16;
-        }
-        std::vector<uint32_t> buffer(DivRoundUp(DataTypeStorageSizeBytes(dataType, maxElements), 4), fillPattern);
-        const Shape padShape = Shape(maxElements);
-        padTensor = CreateConstTensor("pad_const", dataType, std::make_shared<Buffer>(std::move(buffer)), &padShape);
+    int bits = DataTypeSizeBits(dataType);
+    // Mask out the bits from the original constant to force a zero extension regardless
+    // of signedness.
+    uint32_t fillPattern = uint32_t(padConst) & (~0u >> std::max(32 - bits, 0));
+    // Then replicate the bits from the original constant to the rest of the 32-bit value if needed.
+    // So for example the 8-bit value -2 (0xfe) is replicated to 0xfefefefe, while the 16-bit value
+    // -2 (0xfffe) becomes 0xfffefffe.
+    if ( bits < 16 )
+    {
+        fillPattern |= fillPattern << 8;
     }
-
-    for ( int axis = 0; axis < ifmConn->shape.Size(); axis++ )
+    if ( bits < 32 )
+    {
+        fillPattern |= fillPattern << 16;
+    }
+    std::vector<uint32_t> buffer(DivRoundUp(DataTypeStorageSizeBytes(dataType, maxElements), 4), fillPattern);
+    const Shape padShape = Shape(maxElements);
+    padTensor = CreateConstTensor("pad_const", dataType, std::make_shared<Buffer>(std::move(buffer)), &padShape);
+
+    // Padding tensors of higher than rank 4 or rank 4 with a batch larger than 1 requires reshaping to a 3D shape
+    // (HWC) where W is the dimension to pad. Only use this strategy when necessary since it is often slower.
+    const Shape ifmShape = ifmConn->shape;
+    bool reshapeAndPadW = ifmShape.Size() > 4 || (ifmShape.Size() == 4 && ifmShape.Batch() > 1);
+    for ( int axis = 0; axis < ifmShape.Size(); axis++ )
     {
-        // Reshape the IFM/OFM/padding to a 3D shape (HWC) where W dimension is the dimension to pad
-        Shape newIfmShape = ReshapeTo3DAroundAxis(ifmConn->shape, axis);
-        Shape newOfmShape = ReshapeTo3DAroundAxis(ofmShape, axis);
-        Shape newPaddingBefore = ReshapeTo3DAroundAxis(paddingBefore, axis, 0);
+        Shape newOfmShape = reshapeAndPadW ? ReshapeTo3DAroundAxis(ofmShape, axis) : ofmShape;
+        int padAxis = reshapeAndPadW ? 1 : axis;
 
         const int padBefore = paddingBefore[axis];
         if ( padBefore )
         {
-            TensorSlice newOfmSlice = {newPaddingBefore.WithWidth(0), newOfmShape.WithWidth(padBefore)};
-            auto fillOp = MakeFillOperation(ofmConn, newOfmShape, newOfmSlice, padTensor, fillOpType);
+            TensorSlice newOfmSlice = {newOfmShape.WithZeros(), newOfmShape.With(padAxis, padBefore)};
+            auto fillOp = MakeFillOperation(ofmConn, newOfmShape, newOfmSlice, padTensor);
             RecordOptimisation(operation, fillOp);
         }
 
         const int padAfter = paddingAfter[axis];
         if ( padAfter )
         {
-            TensorSlice newOfmSlice = {newPaddingBefore.WithWidth(padBefore + newIfmShape.Width()), newOfmShape.WithWidth(padAfter)};
-            auto fillOp = MakeFillOperation(ofmConn, newOfmShape, newOfmSlice, padTensor, fillOpType);
+            TensorSlice newOfmSlice = {newOfmShape.With(padAxis, newOfmShape[padAxis] - padAfter), newOfmShape.With(padAxis, padAfter)};
+            auto fillOp = MakeFillOperation(ofmConn, newOfmShape, newOfmSlice, padTensor);
             RecordOptimisation(operation, fillOp);
         }
     }
 
@@ -777,7 +764,7 @@ Operation *GraphIrOptimiser::RewritePad(Graph *const, Operation *const operation
     auto copyOp = std::make_shared<Operation>(OpType::MemoryCopy);
     copyOp->CopyInput(TensorUsage::IFM, *ifmConn);
     copyOp->CopyOutput(TensorUsage::OFM, *ofmConn);
-    copyOp->Output(TensorUsage::OFM)->Set({paddingBefore, ifmConn->shape}).Set(RoundMode::NATURAL);
+    copyOp->Output(TensorUsage::OFM)->Set({paddingBefore, ifmShape}).Set(RoundMode::NATURAL);
     RecordOptimisation(operation, copyOp.get());
 
     returnOp = copyOp.get();
diff --git a/ethosu/regor/compiler/graphir_optimiser.hpp b/ethosu/regor/compiler/graphir_optimiser.hpp
index 8a14c700..e54590a9 100644
--- a/ethosu/regor/compiler/graphir_optimiser.hpp
+++ b/ethosu/regor/compiler/graphir_optimiser.hpp
@@ -75,7 +75,7 @@ private:
     Operation *UnrollConv(Graph *const, Operation *const operation);
     // Utility/Helper methods
     Operation *MakeFillOperation(TensorConnection *const ofmConn, const Shape &ofmShape, const TensorSlice &ofmSlice,
-        std::shared_ptr<Tensor> padTensor, OpType opType);
+        std::shared_ptr<Tensor> padTensor);
 
     // The graph optimisation steps.
     // Order matters, array of rewrites processed in order.
-- 
GitLab
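
Notes below the signature (not part of the commit, so git am ignores them):

The fill-pattern logic in the RewritePad hunk packs the pad constant into a
32-bit word: mask to the element width to force a zero extension, then
replicate the element across the word. A minimal standalone C++ sketch of
that arithmetic; MakeFillPattern is a hypothetical helper name, the patch
computes this inline:

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdio>

// Hypothetical helper mirroring the masking/replication in RewritePad.
static uint32_t MakeFillPattern(int32_t padConst, int bits)
{
    assert(bits == 8 || bits == 16 || bits == 32);
    // Mask to the element width so zero extension is forced regardless of signedness.
    uint32_t pattern = uint32_t(padConst) & (~0u >> std::max(32 - bits, 0));
    // Replicate the element to fill the whole 32-bit word.
    if ( bits < 16 ) pattern |= pattern << 8;
    if ( bits < 32 ) pattern |= pattern << 16;
    return pattern;
}

int main()
{
    std::printf("%08x\n", unsigned(MakeFillPattern(-2, 8)));   // fefefefe
    std::printf("%08x\n", unsigned(MakeFillPattern(-2, 16)));  // fffefffe
    std::printf("%08x\n", unsigned(MakeFillPattern(-2, 32)));  // fffffffe
    return 0;
}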
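
The pad-constant sizing and slice placement can be sanity-checked with plain
integers. This sketch substitutes std::vector<int> for regor's Shape and
TensorSlice types, so every name in it is illustrative; it shows why one
constant of maxElements values suffices, and that the "after" slice starts at
ofmShape[padAxis] - padAfter, matching the newOfmShape.With(padAxis, ...)
calls in the patch:

#include <algorithm>
#include <cstdio>
#include <vector>

int main()
{
    // Example padded OFM shape (NHWC) with one row/column of padding in H and W.
    std::vector<int> ofmShape = {1, 8, 8, 16};
    std::vector<int> paddingBefore = {0, 1, 1, 0};
    std::vector<int> paddingAfter = {0, 1, 1, 0};

    long ofmElements = 1;
    for ( int d : ofmShape ) ofmElements *= d;

    // Largest slab needed along any single axis; slices are filled one axis
    // at a time, so the constant never has to be bigger than this.
    long maxElements = 0;
    for ( size_t axis = 0; axis < ofmShape.size(); axis++ )
    {
        long padElements = (ofmElements / ofmShape[axis]) * std::max(paddingBefore[axis], paddingAfter[axis]);
        maxElements = std::max(maxElements, padElements);
    }
    std::printf("pad_const elements: %ld\n", maxElements);  // 128 for this example

    for ( size_t axis = 0; axis < ofmShape.size(); axis++ )
    {
        if ( paddingBefore[axis] )
            std::printf("axis %zu: fill offsets [0, %d)\n", axis, paddingBefore[axis]);
        if ( paddingAfter[axis] )
            std::printf("axis %zu: fill offsets [%d, %d)\n", axis, ofmShape[axis] - paddingAfter[axis], ofmShape[axis]);
    }
    return 0;
}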
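
The slower strategy that the patch now gates behind reshapeAndPadW relies on
ReshapeTo3DAroundAxis, whose body is not shown here. The sketch below is an
assumption inferred from the deleted comment ("Reshape the IFM/OFM/padding to
a 3D shape (HWC) where W dimension is the dimension to pad"), not regor's
actual implementation: leading axes fold into H, the padded axis becomes W,
trailing axes fold into C.

#include <cstdio>
#include <vector>

// Assumed behaviour only; the real function takes regor Shape objects and,
// for the padding shapes, an extra fill-value argument.
static std::vector<int> ReshapeTo3DAroundAxis(const std::vector<int> &shape, size_t axis)
{
    int h = 1, c = 1;
    for ( size_t i = 0; i < axis; i++ ) h *= shape[i];                 // leading axes -> H
    for ( size_t i = axis + 1; i < shape.size(); i++ ) c *= shape[i];  // trailing axes -> C
    return {h, shape[axis], c};                                        // padded axis -> W
}

int main()
{
    // A rank-5 shape: the new reshapeAndPadW condition still routes these this way.
    std::vector<int> s = {2, 3, 4, 5, 6};
    for ( size_t axis = 0; axis < s.size(); axis++ )
    {
        std::vector<int> r = ReshapeTo3DAroundAxis(s, axis);
        std::printf("axis %zu -> {%d, %d, %d}\n", axis, r[0], r[1], r[2]);
    }
    return 0;
}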