From a7c609044e7966f00266d8da641ac0aef4f17e5d Mon Sep 17 00:00:00 2001
From: Alexander Bengtsson
Date: Tue, 3 Dec 2024 16:33:24 +0100
Subject: [PATCH] MLBEDSW-9431: Convert unsupported Rescales to elementwise Mul

- Handle Rescale fallback by converting to elementwise Mul
- Add support for int32 input to Rescales for Ethos-U55 and Ethos-U65

Change-Id: I0043f0b015f5dd52ddb8d4112e2c2ddf67a25dd7
Signed-off-by: Alexander Bengtsson
---
 .../architecture/architecture_constraints.hpp |   1 +
 .../ethosu55/ethos_u55_constraints.cpp        |   6 +
 .../ethosu55/ethos_u55_constraints.hpp        |   1 +
 .../ethosu85/ethos_u85_constraints.cpp        |   7 +
 .../ethosu85/ethos_u85_constraints.hpp        |   1 +
 ethosu/regor/compiler/graphir_optimiser.cpp   | 121 +++++++++++++++++-
 ethosu/regor/compiler/graphir_optimiser.hpp   |   4 +-
 7 files changed, 139 insertions(+), 2 deletions(-)

diff --git a/ethosu/regor/architecture/architecture_constraints.hpp b/ethosu/regor/architecture/architecture_constraints.hpp
index f2bde6a0..70931bcf 100644
--- a/ethosu/regor/architecture/architecture_constraints.hpp
+++ b/ethosu/regor/architecture/architecture_constraints.hpp
@@ -95,6 +95,7 @@ public:
     virtual bool SupportsReverse(OpType opType, ReverseType reverseTypeMask) = 0;
     virtual bool SupportsFusedRescale(OpType opType, TensorUsage tensorUsage, DataType fromType, DataType toType, const Quantization &quantization) = 0;
+    virtual bool SupportsRescale(DataType fromType, DataType toType) = 0;
     virtual bool SupportsTranspose(OpType opType, TransposeType transposeType) = 0;
     virtual bool SupportsAccumulatorSaveRestore() = 0;
diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_constraints.cpp b/ethosu/regor/architecture/ethosu55/ethos_u55_constraints.cpp
index 29bf2b5b..7bdea0d1 100644
--- a/ethosu/regor/architecture/ethosu55/ethos_u55_constraints.cpp
+++ b/ethosu/regor/architecture/ethosu55/ethos_u55_constraints.cpp
@@ -113,6 +113,12 @@ bool EthosU55Constraints::SupportsFusedRescale(
     return false;
 }
 
+bool EthosU55Constraints::SupportsRescale(DataType fromType, DataType toType)
+{
+    UNUSED(toType);
+    return DataTypeSizeBits(fromType) <= 16;
+}
+
 bool EthosU55Constraints::SupportsGather(OpType opType)
 {
     UNUSED(opType);
diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_constraints.hpp b/ethosu/regor/architecture/ethosu55/ethos_u55_constraints.hpp
index 23e3f59c..b12223c7 100644
--- a/ethosu/regor/architecture/ethosu55/ethos_u55_constraints.hpp
+++ b/ethosu/regor/architecture/ethosu55/ethos_u55_constraints.hpp
@@ -32,6 +32,7 @@ public:
     bool SupportsTranspose(OpType opType, TransposeType transposeType) override;
     bool SupportsReverse(OpType opType, ReverseType reverseTypeMask) override;
     bool SupportsFusedRescale(OpType opType, TensorUsage tensorUsage, DataType fromType, DataType toType, const Quantization &quantization) override;
+    bool SupportsRescale(DataType fromType, DataType toType) override;
     bool SupportsAccumulatorSaveRestore() override { return false; }
     bool SupportsGather(OpType opType) override;
     bool SupportsScatter(OpType opType) override;
diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85_constraints.cpp b/ethosu/regor/architecture/ethosu85/ethos_u85_constraints.cpp
index 11b58b3c..339b3cd4 100644
--- a/ethosu/regor/architecture/ethosu85/ethos_u85_constraints.cpp
+++ b/ethosu/regor/architecture/ethosu85/ethos_u85_constraints.cpp
@@ -146,6 +146,13 @@ bool EthosU85Constraints::SupportsFusedRescale(
     return false;
 }
 
+bool EthosU85Constraints::SupportsRescale(DataType fromType, DataType toType)
+{
+    UNUSED(fromType);
+    UNUSED(toType);
+    return true;
+}
+
 bool EthosU85Constraints::SupportsGather(OpType opType)
 {
     EthosU85NpuOp npuOp = ArchEthosU85::GetHWOp(opType);
diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85_constraints.hpp b/ethosu/regor/architecture/ethosu85/ethos_u85_constraints.hpp
index ae3f2593..b06795e5 100644
--- a/ethosu/regor/architecture/ethosu85/ethos_u85_constraints.hpp
+++ b/ethosu/regor/architecture/ethosu85/ethos_u85_constraints.hpp
@@ -32,6 +32,7 @@ public:
     bool SupportsTranspose(OpType opType, TransposeType transposeType) override;
     bool SupportsReverse(OpType opType, ReverseType reverseTypeMask) override;
     bool SupportsFusedRescale(OpType opType, TensorUsage tensorUsage, DataType fromType, DataType toType, const Quantization &quantization) override;
+    bool SupportsRescale(DataType fromType, DataType toType) override;
     bool SupportsAccumulatorSaveRestore() override { return true; }
     bool SupportsGather(OpType opType) override;
     bool SupportsScatter(OpType opType) override;
diff --git a/ethosu/regor/compiler/graphir_optimiser.cpp b/ethosu/regor/compiler/graphir_optimiser.cpp
index 2d5e84bb..c96c4858 100644
--- a/ethosu/regor/compiler/graphir_optimiser.cpp
+++ b/ethosu/regor/compiler/graphir_optimiser.cpp
@@ -490,7 +490,7 @@ Operation *GraphIrOptimiser::RewriteFullyConnected(Graph *const graph, Operation
     return returnOp;
 }
 
-Operation *GraphIrOptimiser::RewriteRescale(Graph *const, Operation *const operation)
+Operation *GraphIrOptimiser::RewriteRescaleInputs(Graph *const, Operation *const operation)
 {
     Operation *returnOp = operation;
     OpType opType = operation->Type();
@@ -530,6 +530,125 @@ Operation *GraphIrOptimiser::RewriteRescale(Graph *const, Operation *const opera
     return returnOp;
 }
 
+/*
+ * Lower 32-bit Rescale into one (or more) elementwise MUL operations.
+ * Multipliers are moved to a constant tensor, while the shift value is kept as OFM quantization.
+ *
+ *   IFM (32-bit)              IFM (32-bit)   Multipliers (32-bit)
+ *        |                             \       /
+ *     Rescale        --->               MUL
+ *        |                               |
+ *   OFM (any format)              OFM (any format)
+ *
+ * Global scaling (one global multiplier):
+ *     Converted into one MUL operation.
+ *
+ * Per-channel scaling (one multiplier per channel):
+ *     The algorithm will attempt to adjust scales to a common shift representation
+ *     to pack consecutive channels into the same MUL operation.
+ *     This can be done as long as the adjustment can be made without precision loss.
+ *     Worst case, per-channel scaling is handled with one MUL operation per channel.
+ */
+Operation *GraphIrOptimiser::RewriteRescale(Graph *const, Operation *const operation)
+{
+    Operation *returnOp = operation;
+    OpType opType = operation->Type();
+    if ( opType == OpType::Rescale )
+    {
+        const auto &ifmConn = operation->Input(TensorUsage::IFM0);
+        const auto &ofmConn = operation->Output(TensorUsage::OFM);
+        const Quantization &quant = ofmConn->quantization;
+        DataType ifmType = ifmConn->tensor->Type();
+        DataType ofmType = ofmConn->tensor->Type();
+        const auto attr = operation->Attribute<rescale_attr_t>();
+        if ( attr->input_unsigned )
+        {
+            ifmType = ifmType & ~unsigned(DataType::Signed);
+        }
+        if ( attr->output_unsigned )
+        {
+            ofmType = ofmType & ~unsigned(DataType::Signed);
+        }
+        if ( ifmType == DataType::Int32 && !_constraints->SupportsRescale(ifmType, ofmType) )
+        {
+            auto CreateRescalingMul = [ifmConn, ofmConn](int startChannel, int endChannel, std::vector<int32_t> &scales, int shift)
+            {
+                Shape sliceOffset = ifmConn->shape.WithZeros().WithDepth(startChannel);
+                Shape sliceShape = ifmConn->shape.WithDepth(endChannel - startChannel);
+                TensorSlice slice{sliceOffset, sliceShape};
+
+                auto mulOp = std::make_shared<Operation>(OpType::Mul);
+                auto buf = std::make_shared<Buffer>(scales.size(), scales.data());
+                auto scaleTensor = CreateConstTensor(fmt::format("multipliers_{}_{}", startChannel, endChannel - 1), DataType::Int32, buf);
+
+                Quantization scaleQuant = Quantization::Unit();
+                scaleQuant.type = QuantizationType::EXPLICIT;
+
+                Quantization ifmQuant = ifmConn->quantization;
+                ifmQuant.scales.clear();
+                ifmQuant.scales.push_back({1, 0});
+                ifmQuant.type = QuantizationType::EXPLICIT;
+
+                Quantization ofmQuant = ofmConn->quantization;
+                ofmQuant.scales.clear();
+                ofmQuant.scales.push_back({1, shift});
+                ofmQuant.type = QuantizationType::EXPLICIT;
+
+                mulOp->ConnectInput(TensorUsage::IFM1, scaleTensor);
+                mulOp->CopyInput(TensorUsage::IFM0, *ifmConn);
+                mulOp->CopyOutput(TensorUsage::OFM, *ofmConn);
+
+                mulOp->Input(TensorUsage::IFM1)->Set(scaleQuant);
+                mulOp->Input(TensorUsage::IFM0)->Set(ifmQuant).Set(slice);
+                mulOp->Output(TensorUsage::OFM)->Set(ofmQuant).Set(slice);
+                return mulOp;
+            };
+
+            // Use the first channel's shift value as the reference shift and try to adjust
+            // the multipliers to pack as many consecutive channels as possible into the same MUL operation
+            int shift = quant.scales[0].shift;
+            std::vector<int32_t> scales;
+            int startChannel = 0;
+            for ( auto qscale : quant.scales )
+            {
+                int shiftDiff = qscale.shift - shift;
+                // Try to right-shift the scale without precision loss.
+                // This can be done if the scale is evenly divisible by 2^shiftDiff
+                if ( (shiftDiff >= 0) && (qscale.scale % (1UL << shiftDiff) == 0) )
+                {
+                    scales.push_back(qscale.scale >> shiftDiff);
+                }
+                else
+                {
+                    // Could not adjust the scale without precision loss.
+                    // Create elementwise mul operation to handle all the previous scales
+                    int endChannel = startChannel + scales.size();
+                    auto mulOp = CreateRescalingMul(startChannel, endChannel, scales, shift);
+                    mulOp->SetRounding(operation->Rounding());
+                    RecordOptimisation(operation, mulOp.get());
+
+                    // reset scales and startChannel
+                    startChannel = endChannel;
+                    scales.clear();
+                    scales.push_back(qscale.scale);
+
+                    // update target shift to the current shift-value
+                    shift = qscale.shift;
+                }
+            }
+
+            // Emit the final mul operation (or the only one for global scaling)
+            int endChannel = ifmConn->shape.Depth();
+            auto mulOp = CreateRescalingMul(startChannel, endChannel, scales, shift);
+            mulOp->SetRounding(operation->Rounding());
+            RecordOptimisation(operation, mulOp.get());
+            returnOp = mulOp.get();
+            operation->Disconnect();
+        }
+    }
+    return returnOp;
+}
+
 // Rewrite TOSA PAD to number of MemoryCopy ops
 Operation *GraphIrOptimiser::RewritePad(Graph *const, Operation *const operation)
 {
diff --git a/ethosu/regor/compiler/graphir_optimiser.hpp b/ethosu/regor/compiler/graphir_optimiser.hpp
index 3ff0b410..cd3720ff 100644
--- a/ethosu/regor/compiler/graphir_optimiser.hpp
+++ b/ethosu/regor/compiler/graphir_optimiser.hpp
@@ -49,6 +49,7 @@ private:
     Tensor *ConvertInt4Tensors(Graph *graph, Tensor *tensor);
    Operation *RewriteFullyConnected(Graph *const graph, Operation *const operation);
    Operation *FixupPoolStrides(Graph *const, Operation *const operation);
+    Operation *RewriteRescaleInputs(Graph *const graph, Operation *const operation);
     Operation *RewriteRescale(Graph *const graph, Operation *const operation);
     Operation *RewritePad(Graph *const graph, Operation *const operation);
     Operation *FuseRescale(Graph *const graph, Operation *const operation);
@@ -110,7 +111,7 @@ private:
         {},
         {
             &GraphIrOptimiser::ConvertAttributes,
-            &GraphIrOptimiser::RewriteRescale,
+            &GraphIrOptimiser::RewriteRescaleInputs,
             &GraphIrOptimiser::FuseRescale,  // First pass fuse all possible ifm and ofm rescales
         }
     },
@@ -124,6 +125,7 @@ private:
         {},
         {
             &GraphIrOptimiser::ConvertAttributes,
+            &GraphIrOptimiser::RewriteRescale,
             &GraphIrOptimiser::ConvertResizeOffsets,
             &GraphIrOptimiser::RewriteFullyConnected,
             &GraphIrOptimiser::FixupPoolStrides,
--
GitLab
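
To make the per-channel packing in RewriteRescale easier to follow, below is a minimal standalone C++ sketch of the grouping heuristic described in the comment block of the patch. The QScale and Group types, the PackChannels helper and the example values are illustrative assumptions only and are not part of the regor code base; in the patch itself each resulting group corresponds to one elementwise MUL, with the adjusted multipliers placed in a constant IFM1 tensor and the shared shift carried as the OFM quantization.

// Standalone sketch of the channel-packing heuristic used when lowering a
// per-channel Rescale to elementwise MULs. Hypothetical types for this
// sketch only; not regor API.
#include <cstdint>
#include <cstdio>
#include <vector>

struct QScale { int32_t scale; int shift; };   // per-channel multiplier and right-shift
struct Group  { int start; int end; int shift; std::vector<int32_t> scales; };

// Pack consecutive channels into groups that can share one OFM shift.
// A channel joins the current group if its scale can be right-shifted by
// shiftDiff = (channel shift - group shift) without precision loss, i.e. if
// the scale is evenly divisible by 2^shiftDiff.
static std::vector<Group> PackChannels(const std::vector<QScale> &perChannel)
{
    std::vector<Group> groups;
    Group current{0, 0, perChannel[0].shift, {}};  // assumes at least one channel
    for ( int ch = 0; ch < int(perChannel.size()); ch++ )
    {
        const QScale &q = perChannel[ch];
        int shiftDiff = q.shift - current.shift;
        if ( shiftDiff >= 0 && q.scale % (int64_t(1) << shiftDiff) == 0 )
        {
            // Lossless adjustment: q.scale * 2^-q.shift == (q.scale >> shiftDiff) * 2^-current.shift
            current.scales.push_back(q.scale >> shiftDiff);
        }
        else
        {
            // Close the group [start, ch) and restart with this channel's shift
            current.end = ch;
            groups.push_back(current);
            current = Group{ch, 0, q.shift, {q.scale}};
        }
    }
    current.end = int(perChannel.size());
    groups.push_back(current);
    return groups;
}

int main()
{
    // Example per-channel scales: channels 0-2 can all be expressed at shift 14
    // (40 * 2^-15 == 20 * 2^-14, 64 * 2^-16 == 16 * 2^-14); channel 3 has an odd
    // scale at shift 15 and forces a new group, which channel 4 then joins.
    std::vector<QScale> perChannel = {{16384, 14}, {40, 15}, {64, 16}, {31, 15}, {100, 15}};
    for ( const Group &g : PackChannels(perChannel) )
    {
        std::printf("MUL over channels [%d, %d), ofm shift %d, multipliers:", g.start, g.end, g.shift);
        for ( int32_t s : g.scales ) std::printf(" %d", int(s));
        std::printf("\n");
    }
    return 0;
}

For this example data the sketch emits one MUL covering channels 0-2 with shift 14 and multipliers 16384, 20, 16, and a second MUL covering channels 3-4 with shift 15 and multipliers 31, 100. In the worst case, where no adjacent channels can share a representable shift, the lowering degenerates to one MUL per channel, as the comment in the patch notes.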