From 498f957b431c2aec4101199323dfaae7f341d619 Mon Sep 17 00:00:00 2001
From: Max Bergfelt
Date: Thu, 10 Apr 2025 15:33:00 +0200
Subject: [PATCH] MLBEDSW-10635: ReinterpretCast operator and cast to int64

Implemented a ReinterpretCast operator which can be used to reinterpret
tensors with different data types and sizes. Additionally, added support
for the cast to int64, which is not supported in hardware, by replacing
the operation with four sequential cast and reinterpret operations.

Change-Id: Ie7032cd5384c17a766dd17034cd59871bb1a833d
Signed-off-by: Max Bergfelt
---
 ethosu/regor/compiler/graphir_optimiser.cpp   | 63 ++++++++++++++++++-
 ethosu/regor/compiler/op_type.cpp             |  3 +-
 ethosu/regor/compiler/op_type.hpp             |  1 +
 ethosu/regor/compiler/scheduler_packing.cpp   | 51 +++++++++++++++
 ethosu/regor/compiler/scheduler_packing.hpp   |  1 +
 .../test/test_tflite_supported_operators.cpp  |  2 +-
 .../tflite/tflite_supported_operators_u55.cpp |  1 +
 7 files changed, 119 insertions(+), 3 deletions(-)

diff --git a/ethosu/regor/compiler/graphir_optimiser.cpp b/ethosu/regor/compiler/graphir_optimiser.cpp
index 0adb5efc..af15538f 100644
--- a/ethosu/regor/compiler/graphir_optimiser.cpp
+++ b/ethosu/regor/compiler/graphir_optimiser.cpp
@@ -1054,7 +1054,7 @@ Operation *GraphIrOptimiser::RewriteTable(Graph *const graph, Operation *const operation)
     return returnOp;
 }
 
-// Rewrite TOSA Cast to other ops
+// Rewrite TOSA Cast and int64 cast to other ops
 Operation *GraphIrOptimiser::RewriteCast(Graph *const, Operation *const operation)
 {
     Operation *returnOp = operation;
@@ -1064,6 +1064,67 @@ Operation *GraphIrOptimiser::RewriteCast(Graph *const, Operation *const operation)
 
     const auto ifmConn = operation->Input(TensorUsage::IFM);
     const auto ofmConn = operation->Output(TensorUsage::OFM);
+    auto ofmType = ofmConn->tensor->Type();
+    /* Casting to int32 is hardware supported, but casting to int64 is not. We solve this by converting
+     * the int64 cast to a series of operations in the following if statement. This does not work for int32 input.
+     * 1. Cast the input to an int32 tensor.
+     *    The tensor size is kept the same (WxHxC -> WxHxC) but the memory size is doubled.
+     * 2. Reinterpret the tensor as an int16 tensor.
+     *    The tensor size is doubled (WxHxC -> WxHx2C), where every second element is 0xFFFF / 0x0000 for
+     *    negative / positive numbers. Memory size is unchanged.
+     * 3. Cast the reinterpreted input to an int32 tensor again.
+     *    The tensor size is again the same (WxHx2C -> WxHx2C) but the size in memory is doubled.
+     * 4. Finally, reinterpret the result as an int64 tensor.
+     *    The 0xFFFFFFFF / 0x00000000 elements become the most significant bits of the int64 values.
+     *    Tensor size is halved (WxHx2C -> WxHxC). */
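+    /* Illustrative example, assuming a little-endian element layout: for the int16 input value
+     * -3 (0xFFFD) the chain produces
+     *   1. cast to int32            -> 0xFFFFFFFD
+     *   2. reinterpret as 2x int16  -> 0xFFFD, 0xFFFF
+     *   3. cast to 2x int32         -> 0xFFFFFFFD, 0xFFFFFFFF
+     *   4. reinterpret as int64     -> 0xFFFFFFFFFFFFFFFD == -3 */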
+    if ( (ofmType == DataType::Int64) || (ofmType == DataType::UInt64) )
+    {
+        bool allowedDataType = ifmConn->tensor->Type() != DataType::Int32 && ifmConn->tensor->Type() != DataType::UInt32;
+        assert(allowedDataType && "Casting from int32 to int64 is not supported.");
+
+        const int c = ifmConn->shape.Depth();
+
+        // Create intermediate tensor for the casting
+        const auto intermediate32Bit = std::make_shared<Tensor>("intermediate_32bit", DataType::Int32, ifmConn->shape);
+
+        // Create double size intermediate tensor for the casting
+        const auto intermediate16Bit2xSize = std::make_shared<Tensor>(
+            "intermediate16Bit2xSize", DataType::Int16, ifmConn->shape.WithDepth(2 * c));
+
+        // Create double size intermediate tensor for the casting
+        const auto intermediate32Bit2xSize = std::make_shared<Tensor>(
+            "intermediate32Bit2xSize", DataType::Int32, ifmConn->shape.WithDepth(2 * c));
+
+        // Connect the cast output to the newly created tensor
+        const auto castOp1 = std::make_shared<Operation>(OpType::Cast);
+        castOp1->CopyInput(TensorUsage::IFM, *ifmConn);
+        castOp1->ConnectOutput(TensorUsage::OFM, intermediate32Bit);
+        RecordOptimisation(operation, castOp1.get());
+
+        // Create reinterpret cast op to reinterpret to 16 bit, double size
+        const auto reinterpretOp1 = std::make_shared<Operation>(OpType::ReinterpretCast);
+        reinterpretOp1->ConnectInput(TensorUsage::IFM, intermediate32Bit);
+        reinterpretOp1->ConnectOutput(TensorUsage::OFM, intermediate16Bit2xSize);
+        RecordOptimisation(operation, reinterpretOp1.get());
+
+        // Create additional cast op
+        const auto castOp2 = std::make_shared<Operation>(OpType::Cast);
+        castOp2->ConnectInput(TensorUsage::IFM, intermediate16Bit2xSize).Set(ifmConn->shape.WithDepth(2 * c));
+        castOp2->ConnectOutput(TensorUsage::OFM, intermediate32Bit2xSize).Set(ifmConn->shape.WithDepth(2 * c));
+        RecordOptimisation(operation, castOp2.get());
+
+        // Create the final reinterpret cast to reinterpret the result as an int64 tensor
+        const auto reinterpretOp2 = std::make_shared<Operation>(OpType::ReinterpretCast);
+        reinterpretOp2->ConnectInput(TensorUsage::IFM, intermediate32Bit2xSize).Set(ifmConn->shape.WithDepth(2 * c));
+        reinterpretOp2->CopyOutput(TensorUsage::OFM, *ofmConn);
+        RecordOptimisation(operation, reinterpretOp2.get());
+
+        ofmConn->quantization = Quantization::Unit();
+        operation->Disconnect();
+        returnOp = reinterpretOp2.get();
+        return returnOp;
+    }
+
     if ( IsBool(ifmConn->tensor->Type()) && IsInteger(ofmConn->tensor->Type()) )
     {
         // Replace CAST with BITWISE_AND to convert from internal bool representation to integer
diff --git a/ethosu/regor/compiler/op_type.cpp b/ethosu/regor/compiler/op_type.cpp
index 5e07a420..3d90d4b0 100644
--- a/ethosu/regor/compiler/op_type.cpp
+++ b/ethosu/regor/compiler/op_type.cpp
@@ -1,5 +1,5 @@
 //
-// SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates
+// SPDX-FileCopyrightText: Copyright 2023-2025 Arm Limited and/or its affiliates
 //
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -87,6 +87,7 @@ BEGIN_ENUM_TABLE(regor::OpType)
     ADD_ENUM_NAME(While)
     // Regor Internal Operators
     ADD_ENUM_NAME(MemoryCopy)
+    ADD_ENUM_NAME(ReinterpretCast)
    ADD_ENUM_NAME(Passthrough)
     ADD_ENUM_NAME(LUT)
     ADD_ENUM_NAME(AndNot)
diff --git a/ethosu/regor/compiler/op_type.hpp b/ethosu/regor/compiler/op_type.hpp
index 15d2f876..77c8ef5f 100644
--- a/ethosu/regor/compiler/op_type.hpp
+++ b/ethosu/regor/compiler/op_type.hpp
@@ -97,6 +97,7 @@ enum class OpType : uint16_t
 
     // Regor Internal Operators
     MemoryCopy,
+    ReinterpretCast,
     Passthrough,
     LUT,
     AndNot,
diff --git a/ethosu/regor/compiler/scheduler_packing.cpp b/ethosu/regor/compiler/scheduler_packing.cpp
index 64ce65cb..d6257719 100644
--- a/ethosu/regor/compiler/scheduler_packing.cpp
+++ b/ethosu/regor/compiler/scheduler_packing.cpp
@@ -137,6 +137,10 @@ void SchedulerPacking::FilterOperations(const std::vector<Operation *> &executionList
     for ( Operation *op : executionList )
     {
         auto schedOp = MakeSchedulerOperation(op, graph);
+        if ( !schedOp )
+        {
+            continue;
+        }
 
         if ( ShouldDecompose(_arch, schedOp.get()) )
         {
@@ -197,6 +201,43 @@ ArchitectureOpGroupQuery SchedulerPacking::CreateOpGroupQuery(const SchedulerOperation
     return query;
 }
 
+// We handle ReinterpretCast by catching it before we create a SchedulerOperation.
+// The mapping is modified so that the OFM GraphIR tensor of the preceding OP and
+// the GraphIR IFM tensor of the succeeding OP map to the same SchedulerTensor.
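+// For illustration: for a producer P and consumer C around a ReinterpretCast,
+//   P -> ifmTensor -[ReinterpretCast]-> ofmTensor -> C
+// both ifmTensor and ofmTensor end up keyed to the same SchedulerTensor in _tensorMap,
+// so the scheduler sees P writing and C reading one buffer, viewed with different types.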
+void SchedulerPacking::HandleReinterpretCast(Operation *op, const Graph *graph)
+{
+    assert(op->Type() == OpType::ReinterpretCast && "Op Type is not ReinterpretCast.");
+
+    const auto ifmConn = op->Input(TensorUsage::IFM);
+    const auto ofmConn = op->Output(TensorUsage::OFM);
+
+    // Try finding the SchedulerTensor mapped to the ReinterpretCast OP's IFM tensor.
+    // If no preceding OP has created it and it can't be found, create it.
+    auto pos = _tensorMap.find(ifmConn->tensor.get());
+    std::shared_ptr<SchedulerTensor> schedTensor;
+    if ( pos == _tensorMap.end() )
+    {
+        schedTensor = std::make_shared<SchedulerTensor>();
+        schedTensor->srcTensor = ifmConn->tensor;
+        InitSchedulerTensor(schedTensor.get(), ifmConn->tensor.get(), graph);
+        _tensorMap.emplace(ifmConn->tensor.get(), schedTensor);
+    }
+    else
+    {
+        schedTensor = pos->second;
+    }
+    // Ensure that both the ReinterpretCast IFM and OFM tensors map to the same SchedulerTensor.
+    _tensorMap.emplace(ofmConn->tensor.get(), schedTensor);
+
+    // If ReinterpretCast is the last OP, its output tensor is the output tensor of the network.
+    // We therefore set isGraphOutput to true and make sure the srcTensor maps to the graph output tensor.
+    if ( graph->IsOutput(ofmConn->tensor.get()) )
+    {
+        InitSchedulerTensor(schedTensor.get(), ofmConn->tensor.get(), graph);
+        schedTensor->srcTensor = ofmConn->tensor;
+    }
+}
+
 void SchedulerPacking::SchedulerPacking::PrePackOperations()
 {
     // Determine if each operation can run on NPU
@@ -540,6 +581,12 @@ std::unique_ptr<SchedulerOperation> SchedulerPacking::MakeSchedulerOperation(Operation *op, const Graph *graph)
 {
     assert(op->Type() != OpType::None);
 
+    if ( op->Type() == OpType::ReinterpretCast )
+    {
+        HandleReinterpretCast(op, graph);
+        return nullptr;
+    }
+
     std::unique_ptr<SchedulerOperation> schedOp = std::make_unique<SchedulerOperation>(op->Type());
     schedOp->SetKernel(*op->Kernel());
 
@@ -578,6 +625,10 @@ std::unique_ptr<SchedulerOperation> SchedulerPacking::MakeSchedulerOperation(Operation *op, const Graph *graph)
             }
             SchedulerConnection *schedConn = IsOFM(item.first) ? schedOp->AddOutput(item.first) : schedOp->AddInput(item.first);
             InitSchedulerConnection(schedConn, schedTensor, item.second);
+            if ( IsIFM(item.first) && tensor->Type() != schedTensor->dataType )
+            {
+                schedConn->SetType(tensor->Type());
+            }
             schedConn->transpose = TransposeType::None;
         }
     }
diff --git a/ethosu/regor/compiler/scheduler_packing.hpp b/ethosu/regor/compiler/scheduler_packing.hpp
index eca43ca8..0289d04b 100644
--- a/ethosu/regor/compiler/scheduler_packing.hpp
+++ b/ethosu/regor/compiler/scheduler_packing.hpp
@@ -71,6 +71,7 @@ private:
     int CanPack(const SchedulerOperation *schedOp, const SchedulerOperation *prevOp, const SchedulerOperation *op, const int prevOpKey) const;
     void InitSchedulerConnection(SchedulerConnection *schedConn, const std::shared_ptr<SchedulerTensor> &tensor, const TensorConnection &conn);
     void InitSchedulerTensor(SchedulerTensor *schedTensor, Tensor *tensor, const Graph *graph);
+    void HandleReinterpretCast(Operation *op, const Graph *graph);
     std::unique_ptr<SchedulerOperation> MakeSchedulerOperation(Operation *op, const Graph *graph);
     std::vector<std::unique_ptr<SchedulerOperation>> DecomposeSchedulerOperation(std::unique_ptr<SchedulerOperation> op);
     ArchResampling ResamplingMode(TensorUsage usage, OpType opType) const;
diff --git a/ethosu/regor/test/test_tflite_supported_operators.cpp b/ethosu/regor/test/test_tflite_supported_operators.cpp
index bbef8e5b..05aaf67e 100644
--- a/ethosu/regor/test/test_tflite_supported_operators.cpp
+++ b/ethosu/regor/test/test_tflite_supported_operators.cpp
@@ -501,7 +501,6 @@ TEST_CASE("Supported operators EthosU55")
 {
     std::set<DataType> unsupported = {
         DataType::Int48,
-        DataType::Int64,
         DataType::UInt48,
         DataType::UInt64,
         DataType::QInt,
@@ -537,6 +536,7 @@ TEST_CASE("Supported operators EthosU55")
         DataType::Int8,
         DataType::Int16,
         DataType::Int32,
+        DataType::Int64,
     };
     for ( auto dtype : unsupported )
     {
diff --git a/ethosu/regor/tflite/tflite_supported_operators_u55.cpp b/ethosu/regor/tflite/tflite_supported_operators_u55.cpp
index 5bad24e3..b8d3b77b 100644
--- a/ethosu/regor/tflite/tflite_supported_operators_u55.cpp
+++ b/ethosu/regor/tflite/tflite_supported_operators_u55.cpp
@@ -89,6 +89,7 @@ TfLiteSupportedOperatorsU55::TfLiteSupportedOperatorsU55(IArchitectureConstraint
         DataType::Int8,
         DataType::Int16,
         DataType::Int32,
+        DataType::Int64,
         // clang-format on
     };
     _maxWeightSum8Bit = 127 * (1 << 16);
-- 
GitLab