diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.cpp b/ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.cpp
index c2f352b8b68e06d47b32717b28b9882165833215..be11613441aca0738fec19c9ba2de47391e73b6a 100644
--- a/ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.cpp
+++ b/ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.cpp
@@ -1732,7 +1732,7 @@ void EthosU85RCSGenerator::GenerateCommon(const HLCStripe *stripe, bool useGloba
 {
     auto op = stripe->operation.get();
     int32_t scalarValue = 0;
-    bool isScalar = IsScalar(op->ifm[0], scalarValue) && IsElementwise(op->type);
+    bool isScalar = IsElementwise(op->type) && IsScalar(op->ifm[0], scalarValue);
     assert(stripe->opGroup != nullptr);
     EthosU85OpGroup *opGroup = static_cast<EthosU85OpGroup *>(stripe->opGroup);
     int ofmCb = opGroup->ChainingBuffer(op->ofm.uid);
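Note on the reordering above: `&&` evaluates left to right and short-circuits, so placing the cheap `IsElementwise(op->type)` check first means `IsScalar(op->ifm[0], scalarValue)`, which inspects the IFM and writes through an out-parameter, is only entered for elementwise operations. A minimal standalone sketch of that effect, using hypothetical stand-ins for the two regor helpers (the real declarations live in the compiler sources):

#include <cstdint>
#include <cstdio>

// Hypothetical stand-ins, only to illustrate the evaluation-order point.
enum class OpType { Add, Conv2D };

static bool IsElementwise(OpType type)
{
    return type == OpType::Add;  // cheap enum check
}

static bool IsScalar(const int32_t *ifm, int32_t &scalarValue)
{
    // Costlier probe that also writes through an out-parameter.
    if ( ifm == nullptr ) return false;
    scalarValue = *ifm;
    return true;
}

int main()
{
    int32_t scalarValue = 0;
    const int32_t *ifm = nullptr;  // e.g. an input with no constant scalar

    // With the type check first, && short-circuits and IsScalar() is never
    // entered for non-elementwise ops: no cost, no side effect on scalarValue.
    bool isScalar = IsElementwise(OpType::Conv2D) && IsScalar(ifm, scalarValue);
    std::printf("isScalar=%d scalarValue=%d\n", int(isScalar), int(scalarValue));
    return 0;
}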
diff --git a/ethosu/regor/compiler/graphir_optimiser.cpp b/ethosu/regor/compiler/graphir_optimiser.cpp
index 4442d402c7951c8356f8e6ee0c4db8963a998854..ab6337653b836c175f31d0436af9706c57114264 100644
--- a/ethosu/regor/compiler/graphir_optimiser.cpp
+++ b/ethosu/regor/compiler/graphir_optimiser.cpp
@@ -676,14 +676,10 @@ Operation *GraphIrOptimiser::RewriteRescale(Graph *const, Operation *const opera
 }
 
 Operation *GraphIrOptimiser::MakeFillOperation(TensorConnection *const ofmConn, const Shape &ofmShape,
-    const TensorSlice &ofmSlice, std::shared_ptr<Tensor> padTensor, OpType opType)
+    const TensorSlice &ofmSlice, std::shared_ptr<Tensor> padTensor)
 {
-    auto fillOp = std::make_shared<Operation>(opType);
-    auto &ifmConn = fillOp->ConnectInput(TensorUsage::IFM, padTensor);
-    if ( opType == OpType::MemoryCopy )
-    {
-        ifmConn.Set(ofmSlice.shape);
-    }
+    auto fillOp = std::make_shared<Operation>(OpType::MemoryCopy);
+    fillOp->ConnectInput(TensorUsage::IFM, padTensor).Set(ofmSlice.shape);
     fillOp->CopyOutput(TensorUsage::OFM, *ofmConn);
     fillOp->Output(TensorUsage::OFM)->Set(ofmShape).Set(ofmSlice).Set(RoundMode::NATURAL);
     return fillOp.get();
@@ -706,69 +702,60 @@ Operation *GraphIrOptimiser::RewritePad(Graph *const, Operation *const operation
     Shape paddingBefore = TensorToShape(paramsConn->tensor.get(), paramsConn->shape.Width(), 2, 0);
     Shape paddingAfter = TensorToShape(paramsConn->tensor.get(), paramsConn->shape.Width(), 2, 1);
 
-    OpType fillOpType;
     std::shared_ptr<Tensor> padTensor;
     DataType dataType = ofmConn->tensor->Type();
-    if ( _constraints->SupportsNot() )
+
+    // Find the largest required pad area and create a constant of that size filled
+    // with the padding value. Then memcopy slices of this tensor to the different
+    // axes to be padded.
+    int maxElements = 0;
+    for ( int axis = 0; axis < ofmShape.Size(); axis++ )
     {
-        // Native support for elementwise Not can be utilized to broadcast a single scalar value
-        // to the whole area to be filled.
-        fillOpType = OpType::Not;
-        padTensor = CreateConstTensor("pad_const", dataType, ~padConst);
+        int padElements = (ofmShape.Elements() / ofmShape[axis]) * std::max(paddingBefore[axis], paddingAfter[axis]);
+        maxElements = std::max(maxElements, padElements);
     }
-    else
-    {
-        // Fallback case - find the largest required pad area and create a constant of that size
-        // filled with the padding value. Then memcopy slices of this tensor to the different
-        // axes to be padded.
-        int maxElements = 0;
-        for ( int axis = 0; axis < ofmShape.Size(); axis++ )
-        {
-            int padElements = (ofmShape.Elements() / ofmShape[axis]) * std::max(paddingBefore[axis], paddingAfter[axis]);
-            maxElements = std::max(maxElements, padElements);
-        }
-        fillOpType = OpType::MemoryCopy;
-        int bits = DataTypeSizeBits(dataType);
-        // Mask out the bits from the original constant to force a zero extension regardless
-        // of signedness.
-        uint32_t fillPattern = uint32_t(padConst) & (~0u >> std::max(32 - bits, 0));
-        // Then replicate the bits from the original constant to the rest of the 32-bit value if needed.
-        // So for example the 8-bit value -2 (0xfe) is replicated to 0xfefefefe, while the 16-bit value
-        // -2 (0xfffe) becomes 0xfffefffe.
-        if ( bits < 16 )
-        {
-            fillPattern |= fillPattern << 8;
-        }
-        if ( bits < 32 )
-        {
-            fillPattern |= fillPattern << 16;
-        }
-        std::vector<uint32_t> buffer(DivRoundUp(DataTypeStorageSizeBytes(dataType, maxElements), 4), fillPattern);
-        const Shape padShape = Shape(maxElements);
-        padTensor = CreateConstTensor("pad_const", dataType, std::make_shared<Buffer>(std::move(buffer)), &padShape);
+    int bits = DataTypeSizeBits(dataType);
+    // Mask out the bits from the original constant to force a zero extension regardless
+    // of signedness.
+    uint32_t fillPattern = uint32_t(padConst) & (~0u >> std::max(32 - bits, 0));
+    // Then replicate the bits from the original constant to the rest of the 32-bit value if needed.
+    // So for example the 8-bit value -2 (0xfe) is replicated to 0xfefefefe, while the 16-bit value
+    // -2 (0xfffe) becomes 0xfffefffe.
+    if ( bits < 16 )
+    {
+        fillPattern |= fillPattern << 8;
     }
-
-    for ( int axis = 0; axis < ifmConn->shape.Size(); axis++ )
+    if ( bits < 32 )
+    {
+        fillPattern |= fillPattern << 16;
+    }
+    std::vector<uint32_t> buffer(DivRoundUp(DataTypeStorageSizeBytes(dataType, maxElements), 4), fillPattern);
+    const Shape padShape = Shape(maxElements);
+    padTensor = CreateConstTensor("pad_const", dataType, std::make_shared<Buffer>(std::move(buffer)), &padShape);
+
+    // Padding tensors of higher than rank 4 or rank 4 with a batch larger than 1 requires reshaping to a 3D shape
+    // (HWC) where W is the dimension to pad. Only use this strategy when necessary since it is often slower.
+    const Shape ifmShape = ifmConn->shape;
+    bool reshapeAndPadW = ifmShape.Size() > 4 || (ifmShape.Size() == 4 && ifmShape.Batch() > 1);
+    for ( int axis = 0; axis < ifmShape.Size(); axis++ )
     {
-        // Reshape the IFM/OFM/padding to a 3D shape (HWC) where W dimension is the dimension to pad
-        Shape newIfmShape = ReshapeTo3DAroundAxis(ifmConn->shape, axis);
-        Shape newOfmShape = ReshapeTo3DAroundAxis(ofmShape, axis);
-        Shape newPaddingBefore = ReshapeTo3DAroundAxis(paddingBefore, axis, 0);
+        Shape newOfmShape = reshapeAndPadW ? ReshapeTo3DAroundAxis(ofmShape, axis) : ofmShape;
+        int padAxis = reshapeAndPadW ? 1 : axis;
 
         const int padBefore = paddingBefore[axis];
         if ( padBefore )
         {
-            TensorSlice newOfmSlice = {newPaddingBefore.WithWidth(0), newOfmShape.WithWidth(padBefore)};
-            auto fillOp = MakeFillOperation(ofmConn, newOfmShape, newOfmSlice, padTensor, fillOpType);
+            TensorSlice newOfmSlice = {newOfmShape.WithZeros(), newOfmShape.With(padAxis, padBefore)};
+            auto fillOp = MakeFillOperation(ofmConn, newOfmShape, newOfmSlice, padTensor);
             RecordOptimisation(operation, fillOp);
         }
 
         const int padAfter = paddingAfter[axis];
         if ( padAfter )
         {
-            TensorSlice newOfmSlice = {newPaddingBefore.WithWidth(padBefore + newIfmShape.Width()), newOfmShape.WithWidth(padAfter)};
-            auto fillOp = MakeFillOperation(ofmConn, newOfmShape, newOfmSlice, padTensor, fillOpType);
+            TensorSlice newOfmSlice = {newOfmShape.With(padAxis, newOfmShape[padAxis] - padAfter), newOfmShape.With(padAxis, padAfter)};
+            auto fillOp = MakeFillOperation(ofmConn, newOfmShape, newOfmSlice, padTensor);
             RecordOptimisation(operation, fillOp);
         }
     }
@@ -777,7 +764,7 @@ Operation *GraphIrOptimiser::RewritePad(Graph *const, Operation *const operation
     auto copyOp = std::make_shared<Operation>(OpType::MemoryCopy);
     copyOp->CopyInput(TensorUsage::IFM, *ifmConn);
    copyOp->CopyOutput(TensorUsage::OFM, *ofmConn);
-    copyOp->Output(TensorUsage::OFM)->Set({paddingBefore, ifmConn->shape}).Set(RoundMode::NATURAL);
+    copyOp->Output(TensorUsage::OFM)->Set({paddingBefore, ifmShape}).Set(RoundMode::NATURAL);
     RecordOptimisation(operation, copyOp.get());
 
     returnOp = copyOp.get();
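The fill-pattern logic above packs the pad constant into a full 32-bit word so that a single std::vector<uint32_t> can back the constant tensor for 8-, 16- and 32-bit element types. A self-contained sketch of just that masking/replication step, wrapped in a hypothetical MakeFillPattern() helper for illustration and checked against the examples quoted in the diff comments:

#include <algorithm>
#include <cassert>
#include <cstdint>

// Standalone copy of the masking/replication step from RewritePad.
static uint32_t MakeFillPattern(int32_t padConst, int bits)
{
    // Mask out the bits of the constant to force a zero extension regardless
    // of signedness (e.g. the 8-bit value -2 becomes 0x000000fe).
    uint32_t fillPattern = uint32_t(padConst) & (~0u >> std::max(32 - bits, 0));
    // Replicate the element across the rest of the 32-bit word if needed.
    if ( bits < 16 ) fillPattern |= fillPattern << 8;
    if ( bits < 32 ) fillPattern |= fillPattern << 16;
    return fillPattern;
}

int main()
{
    // The examples from the comments in the patch:
    assert(MakeFillPattern(-2, 8) == 0xfefefefeu);
    assert(MakeFillPattern(-2, 16) == 0xfffefffeu);
    assert(MakeFillPattern(-2, 32) == 0xfffffffeu);
    return 0;
}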
diff --git a/ethosu/regor/compiler/graphir_optimiser.hpp b/ethosu/regor/compiler/graphir_optimiser.hpp
index 8a14c700f0f6c906249de593e6f8bffc09d82d78..e54590a9e2a098f7eee679293acc932577b37424 100644
--- a/ethosu/regor/compiler/graphir_optimiser.hpp
+++ b/ethosu/regor/compiler/graphir_optimiser.hpp
@@ -75,7 +75,7 @@ private:
     Operation *UnrollConv(Graph *const, Operation *const operation);
     // Utility/Helper methods
     Operation *MakeFillOperation(TensorConnection *const ofmConn, const Shape &ofmShape, const TensorSlice &ofmSlice,
-        std::shared_ptr<Tensor> padTensor, OpType opType);
+        std::shared_ptr<Tensor> padTensor);
 
     // The graph optimisation steps.
     // Order matters, array of rewrites processed in order.
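For reference, the new slice arithmetic in RewritePad addresses the pad-after region from the end of the padded axis (newOfmShape[padAxis] - padAfter) rather than deriving it from the pad-before amount plus the IFM extent; both yield the same offset because the OFM extent is padBefore + ifm + padAfter. A one-dimensional walk-through with plain ints standing in for the regor Shape class:

#include <cstdio>

int main()
{
    const int ifm = 5;                           // IFM extent on the padded axis
    const int padBefore = 2, padAfter = 3;
    const int ofm = padBefore + ifm + padAfter;  // OFM extent: 10

    // Pad-before slice: offset 0, extent padBefore -> covers [0, 2)
    std::printf("before: offset=%d size=%d\n", 0, padBefore);
    // Pad-after slice: offset ofm - padAfter, extent padAfter -> covers [7, 10);
    // the old code computed the same offset as padBefore + ifm.
    std::printf("after:  offset=%d size=%d\n", ofm - padAfter, padAfter);
    // The final MemoryCopy places the IFM at offset padBefore -> covers [2, 7)
    std::printf("copy:   offset=%d size=%d\n", padBefore, ifm);
    return 0;
}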