From 48cd7c60cc184d6ce35e6efa85ab7513c8459d7a Mon Sep 17 00:00:00 2001
From: Rickard Bolin
Date: Thu, 19 Dec 2024 16:14:53 +0000
Subject: [PATCH] MLBEDSW-10199: Normalize quantization before checking OFM
 Rescale fusing

On Ethos-U55 and Ethos-U65, only Rescales with unit scale can be fused
to Minimum operators. However, the output quantization shift was not
normalized to zero before checking whether the scale was unit.

Change-Id: If8e04034045853e0c7b715547b2a68609c11cbe1
Signed-off-by: Rickard Bolin
---
 ethosu/regor/compiler/graphir_optimiser.cpp | 53 ++++++++++++---------
 1 file changed, 31 insertions(+), 22 deletions(-)

diff --git a/ethosu/regor/compiler/graphir_optimiser.cpp b/ethosu/regor/compiler/graphir_optimiser.cpp
index 24bab1b9..aafa0017 100644
--- a/ethosu/regor/compiler/graphir_optimiser.cpp
+++ b/ethosu/regor/compiler/graphir_optimiser.cpp
@@ -833,31 +833,34 @@ Operation *GraphIrOptimiser::FuseRescale(Graph *const graph, Operation *const op
         return returnOp;
     }
 
+    // Convert scales to have 0 shift if possible, since this can improve fusing for Ethos-U55/65
+    auto ConvertedScales = [](const TensorConnection *conn)
+    {
+        auto scales = conn->quantization.scales;
+        for ( auto &qs : scales )
+        {
+            if ( qs.shift > 0 && qs.shift < 31 && (qs.scale % (1 << qs.shift)) == 0 )
+            {
+                qs = {(qs.scale >> qs.shift), 0};
+            }
+        }
+        return scales;
+    };
+
     // Check if there is only one consumer of the output of the rescale and try to fuse to that operation.
     // Note: For input fusing we cannot have an output zero point on the Rescale operation (since the
     // zero point is applied before scaling on inputs), however input zero point is fine.
     if ( ofmConn->tensor->Readers().size() == 1 && ofmConn->quantization.zeroPoints == Quantization::Unit().zeroPoints )
     {
-        // Copies quantization information from ifm connection and (converted) scales from ofm connection,
-        // since these are the scales we want to apply.
-        auto CopyQuantizationAndConvertScales = [](const TensorConnection *ic, const TensorConnection *oc)
-        {
-            auto result = ic->quantization;
-            result.scales = oc->quantization.scales;
-            // Convert scales to have 0 shift if possible, since this can
-            // improve fusing for Ethos-U55/65
-            for ( auto &qs : result.scales )
-            {
-                if ( qs.shift > 0 && qs.shift < 31 && (qs.scale % (1 << qs.shift)) == 0 )
-                {
-                    qs = {(qs.scale >> qs.shift), 0};
-                }
-            }
-            return result;
-        };
         // Propagate rescaling to input of next op
         auto consumer = ofmConn->tensor->Readers().front();
-        auto ifmQuant = CopyQuantizationAndConvertScales(ifmConn, ofmConn);
+
+        // Copy quantization information from ifm connection and (converted) scales from ofm connection,
+        // since these are the scales we want to apply.
+        auto ifmQuant = ifmConn->quantization;
+        // Normalize scales to shift 0 if possible
+        ifmQuant.scales = ConvertedScales(ofmConn);
+
         for ( auto ifm : consumer->Inputs().pairs() )
         {
             if ( ifm.second.tensor == ofmConn->tensor )
@@ -883,8 +886,9 @@ Operation *GraphIrOptimiser::FuseRescale(Graph *const graph, Operation *const op
             if ( otherProducer && otherProducer->Type() == OpType::Rescale )
             {
                 // Check if the other ifm rescale can be fused
-                auto otherIfmQuant = CopyQuantizationAndConvertScales(
-                    otherProducer->Input(TensorUsage::IFM), otherProducer->Output(TensorUsage::OFM));
+                auto otherIfmQuant = otherProducer->Input(TensorUsage::IFM)->quantization;
+                otherIfmQuant.scales = ConvertedScales(otherProducer->Output(TensorUsage::OFM));
+
                 if ( otherIfmCon->quantization.EqualScales(Quantization::Unit()) &&
                      _constraints->SupportsFusedRescale(consumer->Type(), TensorUsage::IFM, otherProducer->IFM(0)->Type(),
                          otherProducer->OFM()->Type(), otherIfmQuant) )
@@ -910,10 +914,15 @@ Operation *GraphIrOptimiser::FuseRescale(Graph *const graph, Operation *const op
     // is only one producer of the input to the rescale operation. If this input has no zero point
     // adjustment and the producers output has unit scaling, it might be possible to fuse this rescale to
     // the producers output if the constraints of the architecture allows it
+
+    // Normalize scales to shift 0 if possible
+    auto ofmQuant = ofmConn->quantization;
+    ofmQuant.scales = ConvertedScales(ofmConn);
+
     if ( returnOp == operation && producer && producer->Output(TensorUsage::OFM)->quantization.EqualScales(Quantization::Unit()) &&
          ifmConn->quantization.zeroPoints == Quantization::Unit().zeroPoints &&
-         _constraints->SupportsFusedRescale(producer->Type(), TensorUsage::OFM, producer->IFM(0)->Type(),
-             ofmConn->tensor->Type(), ofmConn->quantization) )
+         _constraints->SupportsFusedRescale(
+             producer->Type(), TensorUsage::OFM, producer->IFM(0)->Type(), ofmConn->tensor->Type(), ofmQuant) )
     {
         // Propagate rescaling to output of previous op
         producer->CopyOutput(TensorUsage::OFM, *ofmConn);
--
GitLab
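
Note on the representation the fix relies on: as the lambda's arithmetic shows, a quantized
scale is a (scale, shift) pair behaving as scale * 2^(-shift), so {256, 8} is mathematically
the unit scale even though it does not compare equal to {1, 0} field by field. The standalone
C++ sketch below illustrates the per-element normalization rule that the ConvertedScales
lambda applies; QuantizedScale and NormalizeShift are simplified stand-ins written for this
illustration, not regor's actual types or API.

#include <cassert>
#include <cstdint>

// Simplified stand-in for a quantized scale: value = scale * 2^(-shift).
struct QuantizedScale
{
    int64_t scale;
    int shift;
};

// Fold the shift into the scale when the scale is an exact multiple of
// 2^shift; otherwise the encoded value would change, so leave it alone.
QuantizedScale NormalizeShift(QuantizedScale qs)
{
    if ( qs.shift > 0 && qs.shift < 31 && (qs.scale % (int64_t{1} << qs.shift)) == 0 )
    {
        return {qs.scale >> qs.shift, 0};
    }
    return qs;
}

int main()
{
    // {256, 8} encodes 256 * 2^-8 == 1, i.e. unit scale, but a field-by-field
    // comparison with {1, 0} fails until the shift is normalized away.
    QuantizedScale a = NormalizeShift({256, 8});
    assert(a.scale == 1 && a.shift == 0);

    // {3, 1} encodes 1.5; 3 is not divisible by 2, so it stays as-is.
    QuantizedScale b = NormalizeShift({3, 1});
    assert(b.scale == 3 && b.shift == 1);
    return 0;
}

Hoisting the lambda to the top of FuseRescale also makes the normalization available to the
output-fusing path at the end of the function: the SupportsFusedRescale check there previously
saw the raw ofmConn->quantization, so a unit scale expressed as, e.g., {256, 8} was rejected
when fusing a Rescale into a Minimum on Ethos-U55/U65.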