From 14f2efcabfa41b064dab7dd85f525f7d8be1bbc3 Mon Sep 17 00:00:00 2001
From: Alexander Bengtsson
Date: Mon, 13 Jan 2025 13:12:59 +0100
Subject: [PATCH] MLBEDSW-10227: Additional checks for rescale IFM-fusing

- Add 3 missing checks when performing IFM rescale-fusing on binary
  elementwise operations:
  1. Pass both the rescale in/out dtypes and the operation in/out dtypes
     to SupportsFusedRescale. All 4 are required to determine whether an
     operation can be fused.
  2. When performing IFM-fusing, the fused tensor must not be a graph
     output.
  3. When checking whether binary elementwise operations can IFM-fuse,
     the compiler must also check the following for the second input:
     * the input/output unsigned attributes
     * that the fused tensor is not a graph output.

Change-Id: I82ed1d07f14d48c70c8a94b9579be20200029f95
Signed-off-by: Alexander Bengtsson
---
 .../architecture/architecture_constraints.hpp |  4 +-
 .../ethosu55/ethos_u55_constraints.cpp        | 22 +++++----
 .../ethosu55/ethos_u55_constraints.hpp        |  3 +-
 .../ethosu85/ethos_u85_constraints.cpp        | 22 +++++----
 .../ethosu85/ethos_u85_constraints.hpp        |  3 +-
 ethosu/regor/compiler/graphir_optimiser.cpp   | 46 +++++++++++--------
 6 files changed, 57 insertions(+), 43 deletions(-)

diff --git a/ethosu/regor/architecture/architecture_constraints.hpp b/ethosu/regor/architecture/architecture_constraints.hpp
index f1171ea8..c5e24beb 100644
--- a/ethosu/regor/architecture/architecture_constraints.hpp
+++ b/ethosu/regor/architecture/architecture_constraints.hpp
@@ -96,8 +96,8 @@ public:
     virtual ~IArchitectureConstraints() = default;
 
     virtual bool SupportsReverse(OpType opType, ReverseType reverseTypeMask) = 0;
-    virtual bool SupportsFusedRescale(OpType opType, TensorUsage tensorUsage, DataType fromType, DataType toType,
-        const Quantization &quantization) = 0;
+    virtual bool SupportsFusedRescale(OpType opType, TensorUsage tensorUsage, DataType rescaleFromType,
+        DataType rescaleToType, DataType opFromType, DataType opToType, const Quantization &quantization) = 0;
     virtual bool SupportsRescale(DataType fromType, DataType toType) = 0;
     virtual TransposeSupport SupportsTranspose(OpType opType, TransposeType transposeType) = 0;
     virtual bool SupportsAccumulatorSaveRestore() = 0;
diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_constraints.cpp b/ethosu/regor/architecture/ethosu55/ethos_u55_constraints.cpp
index 435e5de7..a97ca2bf 100644
--- a/ethosu/regor/architecture/ethosu55/ethos_u55_constraints.cpp
+++ b/ethosu/regor/architecture/ethosu55/ethos_u55_constraints.cpp
@@ -54,21 +54,22 @@ bool EthosU55Constraints::SupportsReverse(OpType opType, ReverseType reverseType
     return reverseTypeMask == ReverseType::None;
 }
 
-bool EthosU55Constraints::SupportsFusedRescale(
-    OpType opType, TensorUsage tensorUsage, DataType fromType, DataType toType, const Quantization &quantization)
+bool EthosU55Constraints::SupportsFusedRescale(OpType opType, TensorUsage tensorUsage, DataType rescaleFromType,
+    DataType rescaleToType, DataType opFromType, DataType opToType, const Quantization &quantization)
 {
     auto npuOp = ArchEthosU55::GetHWOp(opType);
     bool globalScale = quantization.scales.size() == 1;
-    int fromBits = DataTypeSizeBits(fromType);
-    int toBits = DataTypeSizeBits(toType);
     bool isUnitScale = quantization.IsUnitScale();
 
     if ( tensorUsage == TensorUsage::IFM )
     {
+        int fromBits = DataTypeSizeBits(rescaleFromType);
+        int toBits = DataTypeSizeBits(opToType);
         if ( npuOp == EthosU55NpuOp::Elementwise && globalScale )
         {
-            bool fromTypeSupported = IsInteger(fromType) && (fromBits == 8 || fromBits == 16);
-            bool toTypeSupported = (IsInteger(toType) && (toBits == 8 || toBits == 16)) || toType == DataType::Int32;
+            bool fromTypeSupported = IsInteger(rescaleFromType) && (fromBits == 8 || fromBits == 16);
+            bool toTypeSupported = (IsInteger(opToType) && (toBits == 8 || toBits == 16)) || opToType == DataType::Int32;
+
             // TODO MLBEDSW-10115: Support full 32-bit (advanced) rescale (with nonzero shift)
             // For now only allow 16-bit (simple) rescale
             auto &qs = quantization.scales.front();
@@ -76,10 +77,10 @@ bool EthosU55Constraints::SupportsFusedRescale(
 
             // Make sure the rescale can be done without clipping
             int64_t zp = quantization.zeroPoints.front();
-            int64_t value = (zp < 0 ? int64_t(IntegerMax(fromType)) : IntegerMin(fromType));
+            int64_t value = (zp < 0 ? int64_t(IntegerMax(rescaleFromType)) : IntegerMin(rescaleFromType));
             value = value - zp;
             value = (value * qs.scale) >> qs.shift;
-            bool noClipping = value >= IntegerMin(toType) && value <= int64_t(IntegerMax(toType));
+            bool noClipping = value >= IntegerMin(rescaleToType) && value <= int64_t(IntegerMax(rescaleToType));
 
             if ( opType == OpType::Add || opType == OpType::Sub )
             {
@@ -94,6 +95,7 @@ bool EthosU55Constraints::SupportsFusedRescale(
     }
     else if ( tensorUsage == TensorUsage::OFM )
     {
+        int fromBits = DataTypeSizeBits(opFromType);
         if ( npuOp == EthosU55NpuOp::Convolution || npuOp == EthosU55NpuOp::Depthwise || npuOp == EthosU55NpuOp::Pooling ||
              npuOp == EthosU55NpuOp::VectorProduct )
         {
@@ -101,8 +103,8 @@ bool EthosU55Constraints::SupportsFusedRescale(
         }
         else if ( npuOp == EthosU55NpuOp::Elementwise && globalScale )
         {
-            bool fromTypeSupported = (IsInteger(fromType) && (fromBits == 8 || fromBits == 16)) || fromType == DataType::Int32;
-            if ( fromType == DataType::Int32 )
+            bool fromTypeSupported = (IsInteger(opFromType) && (fromBits == 8 || fromBits == 16)) || opFromType == DataType::Int32;
+            if ( opFromType == DataType::Int32 )
             {
                 // For 32-bit operations scale is not applied but shift is
                 return quantization.scales.front().scale == 1;
diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_constraints.hpp b/ethosu/regor/architecture/ethosu55/ethos_u55_constraints.hpp
index 045125f5..b0355f65 100644
--- a/ethosu/regor/architecture/ethosu55/ethos_u55_constraints.hpp
+++ b/ethosu/regor/architecture/ethosu55/ethos_u55_constraints.hpp
@@ -31,7 +31,8 @@ public:
     bool SupportsMatMul(OpType opType) override;
     TransposeSupport SupportsTranspose(OpType opType, TransposeType transposeType) override;
     bool SupportsReverse(OpType opType, ReverseType reverseTypeMask) override;
-    bool SupportsFusedRescale(OpType opType, TensorUsage tensorUsage, DataType fromType, DataType toType, const Quantization &quantization) override;
+    bool SupportsFusedRescale(OpType opType, TensorUsage tensorUsage, DataType rescaleFromType, DataType rescaleToType,
+        DataType opFromType, DataType opToType, const Quantization &quantization) override;
     bool SupportsRescale(DataType fromType, DataType toType) override;
     bool SupportsAccumulatorSaveRestore() override { return false; }
     bool SupportsGather(OpType opType) override;
diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85_constraints.cpp b/ethosu/regor/architecture/ethosu85/ethos_u85_constraints.cpp
index c64550aa..b24ebb40 100644
--- a/ethosu/regor/architecture/ethosu85/ethos_u85_constraints.cpp
+++ b/ethosu/regor/architecture/ethosu85/ethos_u85_constraints.cpp
@@ -83,31 +83,31 @@ bool EthosU85Constraints::SupportsReverse(OpType opType, ReverseType reverseType
     return true;
 }
 
-bool EthosU85Constraints::SupportsFusedRescale(
-    OpType opType, TensorUsage tensorUsage, DataType fromType, DataType toType, const Quantization &quantization)
+bool EthosU85Constraints::SupportsFusedRescale(OpType opType, TensorUsage tensorUsage, DataType rescaleFromType,
+    DataType rescaleToType, DataType opFromType, DataType opToType, const Quantization &quantization)
 {
     auto npuOp = ArchEthosU85::GetHWOp(opType);
     bool globalScale = quantization.scales.size() == 1;
-    int fromBits = DataTypeSizeBits(fromType);
-    int toBits = DataTypeSizeBits(toType);
     bool isUnitScale = quantization.IsUnitScale();
 
     if ( tensorUsage == TensorUsage::IFM )
    {
+        int fromBits = DataTypeSizeBits(rescaleFromType);
+        int toBits = DataTypeSizeBits(opToType);
         if ( npuOp == EthosU85NpuOp::Elementwise && globalScale )
         {
-            bool fromTypeSupported = (IsInteger(fromType) && fromBits == 8) || fromType == DataType::Int16;
-            bool toTypeSupported = (IsInteger(toType) && (toBits == 8 || toBits == 16)) || toType == DataType::Int32;
+            bool fromTypeSupported = (IsInteger(rescaleFromType) && fromBits == 8) || rescaleFromType == DataType::Int16;
+            bool toTypeSupported = (IsInteger(opToType) && (toBits == 8 || toBits == 16)) || opToType == DataType::Int32;
             auto &qs = quantization.scales.front();
             // Make sure shift is valid
             if ( qs.shift < 0 || qs.shift > 63 ) return false;
 
             // Make sure the rescale can be done without clipping
             int64_t zp = quantization.zeroPoints.front();
-            int64_t value = (zp < 0 ? int64_t(IntegerMax(fromType)) : IntegerMin(fromType));
+            int64_t value = (zp < 0 ? int64_t(IntegerMax(rescaleFromType)) : IntegerMin(rescaleFromType));
             value = value - zp;
             value = (value * qs.scale) >> qs.shift;
-            bool noClipping = value >= IntegerMin(toType) && value <= int64_t(IntegerMax(toType));
+            bool noClipping = value >= IntegerMin(rescaleToType) && value <= int64_t(IntegerMax(rescaleToType));
 
             if ( opType == OpType::Div || opType == OpType::Mul )
             {
@@ -122,6 +122,8 @@ bool EthosU85Constraints::SupportsFusedRescale(
     }
     else if ( tensorUsage == TensorUsage::OFM )
     {
+        int fromBits = DataTypeSizeBits(opFromType);
+        int toBits = DataTypeSizeBits(rescaleToType);
         if ( npuOp == EthosU85NpuOp::Convolution || npuOp == EthosU85NpuOp::Depthwise || npuOp == EthosU85NpuOp::Pooling ||
              npuOp == EthosU85NpuOp::VectorProduct )
         {
@@ -134,8 +136,8 @@ bool EthosU85Constraints::SupportsFusedRescale(
         }
         else if ( npuOp == EthosU85NpuOp::Elementwise && globalScale )
         {
-            bool fromTypeSupported = (IsInteger(fromType) && (fromBits == 8 || fromBits == 16)) || fromType == DataType::Int32;
-            if ( opType == OpType::Mul && fromTypeSupported && fromType == DataType::Int32 )
+            bool fromTypeSupported = (IsInteger(opFromType) && (fromBits == 8 || fromBits == 16)) || opFromType == DataType::Int32;
+            if ( opType == OpType::Mul && fromTypeSupported && opFromType == DataType::Int32 )
             {
                 return quantization.scales.front().scale == 1;  // Only shift supported
             }
diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85_constraints.hpp b/ethosu/regor/architecture/ethosu85/ethos_u85_constraints.hpp
index bd939523..eab0a264 100644
--- a/ethosu/regor/architecture/ethosu85/ethos_u85_constraints.hpp
+++ b/ethosu/regor/architecture/ethosu85/ethos_u85_constraints.hpp
@@ -31,7 +31,8 @@ public:
     bool SupportsMatMul(OpType opType) override;
     TransposeSupport SupportsTranspose(OpType opType, TransposeType transposeType) override;
    bool SupportsReverse(OpType opType, ReverseType reverseTypeMask) override;
-    bool SupportsFusedRescale(OpType opType, TensorUsage tensorUsage, DataType fromType, DataType toType, const Quantization &quantization) override;
+    bool SupportsFusedRescale(OpType opType, TensorUsage tensorUsage, DataType rescaleFromType, DataType rescaleToType,
+        DataType opFromType, DataType opToType, const Quantization &quantization) override;
     bool SupportsRescale(DataType fromType, DataType toType) override;
     bool SupportsAccumulatorSaveRestore() override { return true; }
     bool SupportsGather(OpType opType) override;
diff --git a/ethosu/regor/compiler/graphir_optimiser.cpp b/ethosu/regor/compiler/graphir_optimiser.cpp
index 590bbc8a..eb103f68 100644
--- a/ethosu/regor/compiler/graphir_optimiser.cpp
+++ b/ethosu/regor/compiler/graphir_optimiser.cpp
@@ -860,12 +860,6 @@ Operation *GraphIrOptimiser::FuseRescale(Graph *const graph, Operation *const op
     auto ifmConn = operation->Input(TensorUsage::IFM);
     auto producer = ifmConn->tensor->Writers().size() == 1 ? ifmConn->tensor->Writers().front() : nullptr;
 
-    if ( IsTensorInVector(graph->Outputs(), ifmConn->tensor.get()) )
-    {
-        // If ifm is graph output, fusing it not possible
-        return returnOp;
-    }
-
     // Convert scales to have 0 shift if possible, since this can improve fusing for Ethos-U55/65
     auto ConvertedScales = [](const TensorConnection *conn)
     {
@@ -881,9 +875,11 @@ Operation *GraphIrOptimiser::FuseRescale(Graph *const graph, Operation *const op
     };
 
     // Check if there is only one consumer of the output of the rescale and try to fuse to that operation.
-    // Note: For input fusing we cannot have an output zero point on the Rescale operation (since the
+    // Do not fuse if the connecting tensor is also a graph-output.
+    // Note (ZeroPoints): For input fusing we cannot have an output zero point on the Rescale operation (since the
     // zero point is applied before scaling on inputs), however input zero point is fine.
-    if ( ofmConn->tensor->Readers().size() == 1 && ofmConn->quantization.zeroPoints == Quantization::Unit().zeroPoints )
+    if ( ofmConn->tensor->Readers().size() == 1 && ofmConn->quantization.zeroPoints == Quantization::Unit().zeroPoints &&
+         !IsTensorInVector(graph->Outputs(), ofmConn->tensor.get()) )
     {
         // Propagate rescaling to input of next op
         auto consumer = ofmConn->tensor->Readers().front();
@@ -898,36 +894,46 @@ Operation *GraphIrOptimiser::FuseRescale(Graph *const graph, Operation *const op
         {
             if ( ifm.second.tensor == ofmConn->tensor )
             {
+                auto consumerOfmCon = consumer->Output(TensorUsage::OFM);
                 // This is the input of the next operation that consumes the rescaled values,
                 // check that this input does not already have scaling and that fusing is allowed
                 // by the constraints of the architecture.
                 if ( ifm.second.quantization.EqualScales(Quantization::Unit()) &&
                      _constraints->SupportsFusedRescale(consumer->Type(), TensorUsage::IFM, ifmConn->tensor->Type(),
-                         ofmConn->tensor->Type(), ifmQuant) )
+                         ofmConn->tensor->Type(), ifm.second.tensor->Type(), consumerOfmCon->tensor->Type(), ifmQuant) )
                 {
                     // If the consumer is a binary elementwise make sure that both inputs have
                     // the same data type after fusing.
                     if ( IsBinaryElementwise(consumer->Type()) )
                     {
-                        auto otherIfmCon = consumer->Input(
+                        auto consumerOtherIfmCon = consumer->Input(
                             ofmConn->tensor.get() == consumer->IFM(0) ? TensorUsage::IFM1 : TensorUsage::IFM0);
+
                         auto otherProducer =
-                            otherIfmCon->tensor->Writers().size() == 1 ? otherIfmCon->tensor->Writers().front() : nullptr;
+                            consumerOtherIfmCon->tensor->Writers().size() == 1 ?
+                                consumerOtherIfmCon->tensor->Writers().front() :
+                                nullptr;
+
                         // Both ifms must have same type after fusing (no rescale fused to this input case)
-                        bool sameType = otherIfmCon->tensor->Type() == ifmConn->tensor->Type();
+                        bool sameType = consumerOtherIfmCon->tensor->Type() == ifmConn->tensor->Type();
                         // Is there a Rescale for the other ifm
                         if ( otherProducer && otherProducer->Type() == OpType::Rescale )
                         {
                             // Check if the other ifm rescale can be fused
+                            auto opSign = otherProducer->Attribute();
+                            bool otherFusedTensorInGraphOutputs = IsTensorInVector(
+                                graph->Outputs(), consumerOtherIfmCon->tensor.get());
+                            bool otherRescaleUnsigned = opSign ? (opSign->input_unsigned || opSign->output_unsigned) : false;
+
                             auto otherIfmQuant = otherProducer->Input(TensorUsage::IFM)->quantization;
                             otherIfmQuant.scales = ConvertedScales(otherProducer->Output(TensorUsage::OFM));
-
-                            if ( otherIfmCon->quantization.EqualScales(Quantization::Unit()) &&
+                            if ( consumerOtherIfmCon->quantization.EqualScales(Quantization::Unit()) && !otherRescaleUnsigned && !otherFusedTensorInGraphOutputs &&
                                  _constraints->SupportsFusedRescale(consumer->Type(), TensorUsage::IFM,
-                                     otherProducer->IFM(0)->Type(), otherProducer->OFM()->Type(), otherIfmQuant) )
+                                     otherProducer->IFM(0)->Type(), otherProducer->OFM()->Type(),
+                                     consumerOtherIfmCon->tensor->Type(), consumerOfmCon->tensor->Type(), otherIfmQuant) )
                             {
-                                // and if so that both ifms will have the same type, either when this rescale is
-                                // fused or not.
+                                // If other input can be fused, check that both ifms will have the same type,
+                                // either when this rescale is fused or not.
                                 sameType = sameType || otherProducer->IFM(0)->Type() == ifmConn->tensor->Type();
                             }
                         }
@@ -954,8 +960,10 @@ Operation *GraphIrOptimiser::FuseRescale(Graph *const graph, Operation *const op
     if ( returnOp == operation && producer &&
          producer->Output(TensorUsage::OFM)->quantization.EqualScales(Quantization::Unit()) &&
         ifmConn->quantization.zeroPoints == Quantization::Unit().zeroPoints &&
-        _constraints->SupportsFusedRescale(
-            producer->Type(), TensorUsage::OFM, producer->IFM(0)->Type(), ofmConn->tensor->Type(), ofmQuant) )
+        // fused tensor cannot be in graph-outputs
+        !IsTensorInVector(graph->Outputs(), ifmConn->tensor.get()) &&
+        _constraints->SupportsFusedRescale(producer->Type(), TensorUsage::OFM, ifmConn->tensor->Type(),
+            ofmConn->tensor->Type(), producer->IFM(0)->Type(), producer->OFM()->Type(), ofmQuant) )
     {
         // Propagate rescaling to output of previous op
         producer->CopyOutput(TensorUsage::OFM, *ofmConn);
-- 
GitLab
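
For context, below is a minimal standalone sketch of the "no clipping" pre-check that both
constraint implementations perform before allowing an IFM rescale to be fused: the extreme
input value of the rescale's input type is zero-point corrected, scaled and shifted, and the
result must still fit in the rescale's output type. The MinOf/MaxOf helpers are simplified
stand-ins for the IntegerMin/IntegerMax utilities used in the patch, and the scale, shift and
zero-point values in main() are illustrative only, not taken from the compiler.

#include <cstdint>
#include <iostream>

// Simplified stand-ins for the IntegerMin/IntegerMax helpers (signed types only)
static int64_t MinOf(int bits) { return -(int64_t(1) << (bits - 1)); }
static int64_t MaxOf(int bits) { return (int64_t(1) << (bits - 1)) - 1; }

// True if rescaling the worst-case input value does not clip in the output type
static bool RescaleFitsWithoutClipping(int fromBits, int toBits, int64_t scale, int shift, int64_t zeroPoint)
{
    // Pick the extreme input value that lands furthest from zero after zero-point correction
    int64_t value = (zeroPoint < 0) ? MaxOf(fromBits) : MinOf(fromBits);
    value -= zeroPoint;               // apply the input zero point
    value = (value * scale) >> shift; // apply the (simple) rescale
    return value >= MinOf(toBits) && value <= MaxOf(toBits);
}

int main()
{
    std::cout << std::boolalpha
              << RescaleFitsWithoutClipping(8, 16, 3, 1, -128) << "\n"    // 382 fits in int16: true
              << RescaleFitsWithoutClipping(8, 8, 512, 0, -128) << "\n";  // 130560 clips in int8: false
    return 0;
}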