From 6770537febd0a97b8b2f8562aa4ac392d468e972 Mon Sep 17 00:00:00 2001 From: Johan Gunnarsson Date: Wed, 2 Jul 2025 15:01:41 +0200 Subject: [PATCH] MLBEDSW-10899: Handle RESCALE with output unsigned properly Networks with a RESCALE that writes to graph output and has output_unsigned set lost the unsignedness of the output tensor when the ReinterpretCast op was handled. This caused wrong OFM clipping and zero point. This patch changes the following: * Don't add REINTERPRETCAST before/after RESCALE. Instead, keep the RESCALE and its attributes and deal with the unsigned input/output when lowering SchedIR to HLC. * Reset all buffers when cloning IFM. This is important when cloning IFM with the purpose of creating a new intermediate tensor. If the IFM is constant, the cloned tensor will inherit the constant data as well, which is not desired in most cases. Signed-off-by: Johan Gunnarsson Change-Id: Ie0adbdfbee18e92568acb1d1f44637fbdd309e8d --- ethosu/regor/common/data_type.hpp | 5 ++ ethosu/regor/compiler/graphir_optimiser.cpp | 54 +------------------ ethosu/regor/compiler/graphir_optimiser.hpp | 1 - .../high_level_command_stream_generator.cpp | 30 +++++++++++ ethosu/regor/compiler/optimiser_utils.cpp | 3 +- ethosu/regor/compiler/scheduler.cpp | 5 ++ ethosu/regor/compiler/scheduler_decompose.cpp | 6 +++ .../regor/compiler/tflite_graph_optimiser.cpp | 2 + 8 files changed, 52 insertions(+), 54 deletions(-) diff --git a/ethosu/regor/common/data_type.hpp b/ethosu/regor/common/data_type.hpp index bb02cb1c..6c40f13d 100644 --- a/ethosu/regor/common/data_type.hpp +++ b/ethosu/regor/common/data_type.hpp @@ -226,6 +226,11 @@ inline std::string DataTypeToString(const DataType type) return EnumToString(type); } +inline constexpr DataType DataTypeSetSignedness(DataType type, bool setSigned) +{ + return (type & ~unsigned(DataType::Signed)) | (setSigned ? 
DataType::Signed : DataType::None); +} + inline constexpr bool IsInteger(DataType type) { return (type & DataType::Int) == DataType::Int; diff --git a/ethosu/regor/compiler/graphir_optimiser.cpp b/ethosu/regor/compiler/graphir_optimiser.cpp index efa5e3c4..2f80e6ae 100644 --- a/ethosu/regor/compiler/graphir_optimiser.cpp +++ b/ethosu/regor/compiler/graphir_optimiser.cpp @@ -99,6 +99,7 @@ Tensor *GraphIrOptimiser::ConvertBool8Tensors(Graph *graph, Tensor *tensor) { // Replace the OFM of ops producing the graph output tensor std::shared_ptr newTensor = tensor->Clone(); + newTensor->SetBuffer(nullptr); newTensor->SetName(newTensor->Name() + "_int8"); std::shared_ptr graphOutputTensor = tensor->shared_from_this(); ReplaceProducerOutput(graphOutputTensor->Writers(), graphOutputTensor.get(), newTensor); @@ -681,58 +682,6 @@ Operation *GraphIrOptimiser::RewriteRescaleInputs(Graph *const, Operation *const return returnOp; } -Operation *GraphIrOptimiser::RemoveRescaleUnsignedAttribute(Graph *const, Operation *const operation) -{ - OpType opType = operation->Type(); - if ( opType == OpType::Rescale ) - { - auto signAttr = operation->Attribute(); - if ( signAttr->input_unsigned ) - { - const auto &ifmConn = operation->Input(TensorUsage::IFM0); - DataType ifmType = ifmConn->tensor->Type(); - auto newIfmType = ifmType & ~unsigned(DataType::Signed); - - // Create a reinterpret OP to reinterpret the input as unsigned - auto reinterpretOp = std::make_shared(OpType::ReinterpretCast); - - // Create an unsigned data type tensor for the reinterpret OP - std::shared_ptr unsignedTensor = ifmConn->tensor->Clone(); - unsignedTensor->ChangeType(newIfmType); - - // Connect the reinterpret OP between the rescale and the rescale IFM - reinterpretOp->CopyInput(TensorUsage::IFM, *ifmConn); - reinterpretOp->ConnectOutput(TensorUsage::OFM, unsignedTensor); - - // Connect the rescale OP input to the unsigned data type tensor - operation->ConnectInput(TensorUsage::IFM, unsignedTensor); - 
signAttr->input_unsigned = false; - } - if ( signAttr->output_unsigned ) - { - const auto &ofmConn = operation->Output(TensorUsage::OFM); - DataType ofmType = ofmConn->tensor->Type(); - auto newOfmType = ofmType & ~unsigned(DataType::Signed); - - // Create a reinterpret OP to reinterpret the input as unsigned - auto reinterpretOp = std::make_shared(OpType::ReinterpretCast); - - // Create an unsigned data type tensor for the reinterpret OP - std::shared_ptr unsignedTensor = ofmConn->tensor->Clone(); - unsignedTensor->ChangeType(newOfmType); - - // Connect the reinterpret OP between the rescale and the rescale OFM - reinterpretOp->ConnectInput(TensorUsage::IFM, unsignedTensor); - reinterpretOp->CopyOutput(TensorUsage::OFM, *ofmConn); - - // Connect the rescale OP output to the unsigned data type tensor - operation->ConnectOutput(TensorUsage::OFM, unsignedTensor); - signAttr->output_unsigned = false; - } - } - return operation; -} - /* * Lower Rescale into one (or more) 32-bit elementwise MUL operations. 
* Multipliers are moved to a constant-tensor, while the shift value is keps as ofm-quantization @@ -1723,6 +1672,7 @@ Operation *GraphIrOptimiser::RewriteReduceSum(Graph *const graph, Operation *con // Temporary tensor between ReduceSum and Sub std::shared_ptr reduceSumTens = ofmConn->tensor->Clone(); + reduceSumTens->SetBuffer(nullptr); reduceSumTens->SetName(ofmConn->tensor->Name() + "_reducesum"); reduceSumTens->ChangeType(DataType::Int32); reduceSumTens->SetStorageShape(ofmConn->shape); diff --git a/ethosu/regor/compiler/graphir_optimiser.hpp b/ethosu/regor/compiler/graphir_optimiser.hpp index e36c308e..9b36d2af 100644 --- a/ethosu/regor/compiler/graphir_optimiser.hpp +++ b/ethosu/regor/compiler/graphir_optimiser.hpp @@ -130,7 +130,6 @@ private: { &GraphIrOptimiser::ConvertZeroPointTensors, &GraphIrOptimiser::RewriteRescaleInputs, - &GraphIrOptimiser::RemoveRescaleUnsignedAttribute, &GraphIrOptimiser::FuseRescale, // First pass fuse all possible ifm and ofm rescales } }, diff --git a/ethosu/regor/compiler/high_level_command_stream_generator.cpp b/ethosu/regor/compiler/high_level_command_stream_generator.cpp index b3efcf5c..48900207 100644 --- a/ethosu/regor/compiler/high_level_command_stream_generator.cpp +++ b/ethosu/regor/compiler/high_level_command_stream_generator.cpp @@ -289,6 +289,11 @@ static HLCSubOperation MakeSubOperation(const std::unique_ptrType(); auto lutConn = schedOp->TryInput(TensorUsage::LUT); size_t ifms = 0; + sign_attr_t *signAttr = nullptr; + if ( schedOp->Type() == OpType::Rescale && schedOp->HasAttribute() ) + { + signAttr = schedOp->Attribute(); + } for ( const auto &input : schedOp->inputs.pairs() ) { if ( IsIFM(input.first) || GetUsageType(input.first) == TensorUsage::Scratch ) @@ -308,9 +313,19 @@ static HLCSubOperation MakeSubOperation(const std::unique_ptrdataType = DataTypeSetSignedness(at->dataType, !signAttr->input_unsigned); + } } } MakeFeatureMap(TensorUsage::OFM, schedOp->OFM(), hlcSubOp.ofm); + if ( signAttr ) + { + // Fixup 
OFM datatype signedness for rescale ops + hlcSubOp.ofm.dataType = DataTypeSetSignedness(hlcSubOp.ofm.dataType, !signAttr->output_unsigned); + } hlcSubOp.srcId = schedOp->Uid(); @@ -340,6 +355,11 @@ static std::shared_ptr MakeOperation(SchedulerOperation *schedOp, op->config = opInfo->Config(); op->srcId = schedOp->Uid(); size_t ifms = 0; + sign_attr_t *signAttr = nullptr; + if ( schedOp->Type() == OpType::Rescale && schedOp->HasAttribute() ) + { + signAttr = schedOp->Attribute(); + } for ( const auto &input : schedOp->inputs.pairs() ) { if ( IsIFM(input.first) || GetUsageType(input.first) == TensorUsage::Scratch ) @@ -358,9 +378,19 @@ static std::shared_ptr MakeOperation(SchedulerOperation *schedOp, at = op->ifm.emplace(op->ifm.end()); } MakeFeatureMap(input.first, &input.second, *at); + if ( signAttr ) + { + // Fixup IFM datatype signedness for rescale ops + at->dataType = DataTypeSetSignedness(at->dataType, !signAttr->input_unsigned); + } } } MakeFeatureMap(TensorUsage::OFM, schedOp->OFM(), op->ofm); + if ( signAttr ) + { + // Fixup OFM datatype signedness for rescale ops + op->ofm.dataType = DataTypeSetSignedness(op->ofm.dataType, !signAttr->output_unsigned); + } #ifndef NDEBUG op->name = schedOp->OFM()->tensor->Name(); diff --git a/ethosu/regor/compiler/optimiser_utils.cpp b/ethosu/regor/compiler/optimiser_utils.cpp index f52fa0e1..bcf90fda 100644 --- a/ethosu/regor/compiler/optimiser_utils.cpp +++ b/ethosu/regor/compiler/optimiser_utils.cpp @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -38,6 +38,7 @@ bool IsTensorInVector(const std::vector> &tensorVec, con std::shared_ptr InsertCopyOpAfterTensor(const std::shared_ptr &ifm, const Quantization &quantization) { std::shared_ptr copyTensor = ifm->Clone(); + copyTensor->SetBuffer(nullptr); auto copyOp = 
std::make_shared(OpType::MemoryCopy); copyOp->ConnectInput(TensorUsage::IFM0, ifm).Set(quantization); auto name = ifm->Name(); diff --git a/ethosu/regor/compiler/scheduler.cpp b/ethosu/regor/compiler/scheduler.cpp index d2b6ca13..77eda599 100644 --- a/ethosu/regor/compiler/scheduler.cpp +++ b/ethosu/regor/compiler/scheduler.cpp @@ -309,6 +309,11 @@ int Scheduler::UpdateSchedulerTensor(TensorUsage usage, SchedulerConnection *con ArchOperatorQuery query; Set(query.ifm[0], consumer->TryIFM(0)); Set(query.ifm[1], consumer->TryIFM(1)); + if ( consumer->Type() == OpType::Rescale && consumer->HasAttribute() ) + { + const auto attr = consumer->Attribute(); + query.ifm[0].type = DataTypeSetSignedness(query.ifm[0].type, !attr->input_unsigned); + } query.transposeMask = consumer->OFM()->transpose; if ( _arch->Constraints()->OperatorQuery(consumer->Type(), &query, &req).Any(QueryResult::Native) ) { diff --git a/ethosu/regor/compiler/scheduler_decompose.cpp b/ethosu/regor/compiler/scheduler_decompose.cpp index 11c549f9..0c43470f 100644 --- a/ethosu/regor/compiler/scheduler_decompose.cpp +++ b/ethosu/regor/compiler/scheduler_decompose.cpp @@ -39,6 +39,12 @@ Flags OperatorQuery(Architecture *arch, const SchedulerOperation *s Set(query.ifm[0], schedOp->IFM(0)); Set(query.ifm[1], schedOp->TryIFM(1)); Set(query.ofm, ofmConn); + if ( schedOp->Type() == OpType::Rescale && schedOp->HasAttribute() ) + { + const auto attr = schedOp->Attribute(); + query.ifm[0].type = DataTypeSetSignedness(query.ifm[0].type, !attr->input_unsigned); + query.ofm.type = DataTypeSetSignedness(query.ofm.type, !attr->output_unsigned); + } const auto weights = schedOp->TryInput(TensorUsage::Weights); const auto scales = schedOp->TryInput(TensorUsage::Scales); const bool constantWeights = weights && weights->tensor && weights->tensor->IsConstant(); diff --git a/ethosu/regor/compiler/tflite_graph_optimiser.cpp b/ethosu/regor/compiler/tflite_graph_optimiser.cpp index 9fd7a2a7..fdd716c3 100644 --- 
a/ethosu/regor/compiler/tflite_graph_optimiser.cpp +++ b/ethosu/regor/compiler/tflite_graph_optimiser.cpp @@ -185,6 +185,7 @@ Operation *TFLiteGraphOptimiser::ConvertLeakyRelu16bit(TensorConnection &ifmConn // Create Minimum(IFM, 0) std::shared_ptr zeroTens = CreateConstTensor("zero_const", ifmConn.tensor->Type(), 0); std::shared_ptr fmNegative = ifmConn.tensor->Clone(); + fmNegative->SetBuffer(nullptr); auto minOp = std::make_shared(OpType::Minimum); minOp->CopyInput(TensorUsage::IFM0, ifmConn); minOp->ConnectInput(TensorUsage::IFM1, zeroTens).Set(ifmConn.quantization); @@ -2109,6 +2110,7 @@ Operation *TFLiteGraphOptimiser::ConvertPrelu(Graph *const graph, Operation *con std::shared_ptr zeroTens = CreateConstTensor("zero_const", ifmConn->tensor->Type(), 0); std::shared_ptr fmNegative = ifmConn->tensor->Clone(); + fmNegative->SetBuffer(nullptr); std::shared_ptr fmAlpha = ofmConn->tensor->Clone(); std::shared_ptr fmScaled = ofmConn->tensor->Clone(); -- GitLab