From 498f957b431c2aec4101199323dfaae7f341d619 Mon Sep 17 00:00:00 2001
From: Max Bergfelt
Date: Thu, 10 Apr 2025 15:33:00 +0200
Subject: [PATCH] MLBEDSW-10635: ReinterpretCast operator and cast to int64

Implemented a ReinterpretCast operator which can be used to reinterpret
tensors with different data types and sizes. Additionally, added support
for the cast to int64, which is not supported in hardware, by replacing
the operation with four sequential cast and reinterpret operations.

Change-Id: Ie7032cd5384c17a766dd17034cd59871bb1a833d
Signed-off-by: Max Bergfelt
---
 ethosu/regor/compiler/graphir_optimiser.cpp   | 63 ++++++++++++++++++-
 ethosu/regor/compiler/op_type.cpp             |  3 +-
 ethosu/regor/compiler/op_type.hpp             |  1 +
 ethosu/regor/compiler/scheduler_packing.cpp   | 51 +++++++++++++++
 ethosu/regor/compiler/scheduler_packing.hpp   |  1 +
 .../test/test_tflite_supported_operators.cpp  |  2 +-
 .../tflite/tflite_supported_operators_u55.cpp |  1 +
 7 files changed, 119 insertions(+), 3 deletions(-)

diff --git a/ethosu/regor/compiler/graphir_optimiser.cpp b/ethosu/regor/compiler/graphir_optimiser.cpp
index 0adb5efc..af15538f 100644
--- a/ethosu/regor/compiler/graphir_optimiser.cpp
+++ b/ethosu/regor/compiler/graphir_optimiser.cpp
@@ -1054,7 +1054,7 @@ Operation *GraphIrOptimiser::RewriteTable(Graph *const graph, Operation *const operation)
     return returnOp;
 }
 
-// Rewrite TOSA Cast to other ops
+// Rewrite TOSA Cast and int64 cast to other ops
 Operation *GraphIrOptimiser::RewriteCast(Graph *const, Operation *const operation)
 {
     Operation *returnOp = operation;
@@ -1064,6 +1064,67 @@ Operation *GraphIrOptimiser::RewriteCast(Graph *const, Operation *const operation)
 
     const auto ifmConn = operation->Input(TensorUsage::IFM);
     const auto ofmConn = operation->Output(TensorUsage::OFM);
+    auto ofmType = ofmConn->tensor->Type();
+    /* Casting to int32 is hardware supported, but casting to int64 is not. We solve this by converting
+     * the int64 cast to a series of operations in the following if statement. This does not work for int32 input.
+     * 1. Cast the input to an int32 tensor.
+     *    The tensor size is kept the same (WxHxC -> WxHxC) but the memory size is doubled.
+     * 2. Reinterpret the tensor as an int16 tensor.
+     *    The tensor size is doubled (WxHxC -> WxHx2C), where every second element is 0xFFFF / 0x0000 for
+     *    negative / positive numbers. Memory size is unchanged.
+     * 3. Cast the reinterpreted input to an int32 tensor again.
+     *    The tensor size is again the same (WxHx2C -> WxHx2C) but the size in memory is doubled.
+     * 4. Finally, reinterpret the result as an int64 tensor.
+     *    The 0xFFFFFFFF / 0x00000000 elements become the most significant bits of the int64 values.
+     *    Tensor size is halved (WxHx2C -> WxHxC). */
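+    /* Illustrative example, assuming a little-endian element layout: for the int16 input value
+     * -3 (0xFFFD) the chain produces
+     *   1. cast to int32            -> 0xFFFFFFFD
+     *   2. reinterpret as 2x int16  -> 0xFFFD, 0xFFFF
+     *   3. cast to 2x int32         -> 0xFFFFFFFD, 0xFFFFFFFF
+     *   4. reinterpret as int64     -> 0xFFFFFFFFFFFFFFFD == -3 */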
+    if ( (ofmType == DataType::Int64) || (ofmType == DataType::UInt64) )
+    {
+        bool allowedDataType = ifmConn->tensor->Type() != DataType::Int32 && ifmConn->tensor->Type() != DataType::UInt32;
+        assert(allowedDataType && "Casting from int32 to int64 is not supported.");
+
+        const int c = ifmConn->shape.Depth();
+
+        // Create intermediate tensor for the casting
+        const auto intermediate32Bit = std::make_shared<Tensor>("intermediate_32bit", DataType::Int32, ifmConn->shape);
+
+        // Create double size intermediate tensor for the casting
+        const auto intermediate16Bit2xSize = std::make_shared<Tensor>(
+            "intermediate16Bit2xSize", DataType::Int16, ifmConn->shape.WithDepth(2 * c));
+
+        // Create double size intermediate tensor for the casting
+        const auto intermediate32Bit2xSize = std::make_shared<Tensor>(
+            "intermediate32Bit2xSize", DataType::Int32, ifmConn->shape.WithDepth(2 * c));
+
+        // Connect the cast output to the newly created tensor
+        const auto castOp1 = std::make_shared<Operation>(OpType::Cast);
+        castOp1->CopyInput(TensorUsage::IFM, *ifmConn);
+        castOp1->ConnectOutput(TensorUsage::OFM, intermediate32Bit);
+        RecordOptimisation(operation, castOp1.get());
+
+        // Create reinterpret cast op to reinterpret to 16 bit, double size
+        const auto reinterpretOp1 = std::make_shared<Operation>(OpType::ReinterpretCast);
+        reinterpretOp1->ConnectInput(TensorUsage::IFM, intermediate32Bit);
+        reinterpretOp1->ConnectOutput(TensorUsage::OFM, intermediate16Bit2xSize);
+        RecordOptimisation(operation, reinterpretOp1.get());
+
+        // Create additional cast op
+        const auto castOp2 = std::make_shared<Operation>(OpType::Cast);
+        castOp2->ConnectInput(TensorUsage::IFM, intermediate16Bit2xSize).Set(ifmConn->shape.WithDepth(2 * c));
+        castOp2->ConnectOutput(TensorUsage::OFM, intermediate32Bit2xSize).Set(ifmConn->shape.WithDepth(2 * c));
+        RecordOptimisation(operation, castOp2.get());
+
+        // Create the final reinterpret cast to reinterpret the result as an int64 tensor
+        const auto reinterpretOp2 = std::make_shared<Operation>(OpType::ReinterpretCast);
+        reinterpretOp2->ConnectInput(TensorUsage::IFM, intermediate32Bit2xSize).Set(ifmConn->shape.WithDepth(2 * c));
+        reinterpretOp2->CopyOutput(TensorUsage::OFM, *ofmConn);
+        RecordOptimisation(operation, reinterpretOp2.get());
+
+        ofmConn->quantization = Quantization::Unit();
+        operation->Disconnect();
+        returnOp = reinterpretOp2.get();
+        return returnOp;
+    }
+
     if ( IsBool(ifmConn->tensor->Type()) && IsInteger(ofmConn->tensor->Type()) )
     {
         // Replace CAST with BITWISE_AND to convert from internal bool representation to integer
diff --git a/ethosu/regor/compiler/op_type.cpp b/ethosu/regor/compiler/op_type.cpp
index 5e07a420..3d90d4b0 100644
--- a/ethosu/regor/compiler/op_type.cpp
+++ b/ethosu/regor/compiler/op_type.cpp
@@ -1,5 +1,5 @@
 //
-// SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates
+// SPDX-FileCopyrightText: Copyright 2023-2025 Arm Limited and/or its affiliates
 //
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -87,6 +87,7 @@ BEGIN_ENUM_TABLE(regor::OpType)
     ADD_ENUM_NAME(While)
     // Regor Internal Operators
     ADD_ENUM_NAME(MemoryCopy)
+    ADD_ENUM_NAME(ReinterpretCast)
    ADD_ENUM_NAME(Passthrough)
     ADD_ENUM_NAME(LUT)
     ADD_ENUM_NAME(AndNot)
diff --git a/ethosu/regor/compiler/op_type.hpp b/ethosu/regor/compiler/op_type.hpp
index 15d2f876..77c8ef5f 100644
--- a/ethosu/regor/compiler/op_type.hpp
+++ b/ethosu/regor/compiler/op_type.hpp
@@ -97,6 +97,7 @@ enum class OpType : uint16_t
 
     // Regor Internal Operators
     MemoryCopy,
+    ReinterpretCast,
     Passthrough,
     LUT,
     AndNot,
diff --git a/ethosu/regor/compiler/scheduler_packing.cpp b/ethosu/regor/compiler/scheduler_packing.cpp
index 64ce65cb..d6257719 100644
--- a/ethosu/regor/compiler/scheduler_packing.cpp
+++ b/ethosu/regor/compiler/scheduler_packing.cpp
@@ -137,6 +137,10 @@ void SchedulerPacking::FilterOperations(const std::vector<Operation *> &executionList
     for ( Operation *op : executionList )
     {
         auto schedOp = MakeSchedulerOperation(op, graph);
+        if ( !schedOp )
+        {
+            continue;
+        }
 
         if ( ShouldDecompose(_arch, schedOp.get()) )
         {
@@ -197,6 +201,43 @@ ArchitectureOpGroupQuery SchedulerPacking::CreateOpGroupQuery(const SchedulerOperation
     return query;
 }
 
+// We handle ReinterpretCast by catching it before we create a SchedulerOperation.
+// The mapping is modified so that the OFM GraphIR tensor of the preceding OP and
+// the GraphIR IFM tensor of the succeeding OP map to the same SchedulerTensor.
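+// For illustration: for a producer P and consumer C around a ReinterpretCast,
+//   P -> ifmTensor -[ReinterpretCast]-> ofmTensor -> C
+// both ifmTensor and ofmTensor end up keyed to the same SchedulerTensor in _tensorMap,
+// so the scheduler sees P writing and C reading one buffer, viewed with different types.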
+void SchedulerPacking::HandleReinterpretCast(Operation *op, const Graph *graph)
+{
+    assert(op->Type() == OpType::ReinterpretCast && "Op Type is not ReinterpretCast.");
+
+    const auto ifmConn = op->Input(TensorUsage::IFM);
+    const auto ofmConn = op->Output(TensorUsage::OFM);
+
+    // Try finding the SchedulerTensor mapped to the ReinterpretCast OP's IFM tensor.
+    // If no preceding OP has created it and it can't be found, create it.
+    auto pos = _tensorMap.find(ifmConn->tensor.get());
+    std::shared_ptr<SchedulerTensor> schedTensor;
+    if ( pos == _tensorMap.end() )
+    {
+        schedTensor = std::make_shared<SchedulerTensor>();
+        schedTensor->srcTensor = ifmConn->tensor;
+        InitSchedulerTensor(schedTensor.get(), ifmConn->tensor.get(), graph);
+        _tensorMap.emplace(ifmConn->tensor.get(), schedTensor);
+    }
+    else
+    {
+        schedTensor = pos->second;
+    }
+    // Ensure that both the ReinterpretCast IFM and OFM tensors map to the same SchedulerTensor.
+    _tensorMap.emplace(ofmConn->tensor.get(), schedTensor);
+
+    // If ReinterpretCast is the last OP, its output tensor is the output tensor of the network.
+    // We therefore set isGraphOutput to true and make sure the srcTensor maps to the graph output tensor.
+    if ( graph->IsOutput(ofmConn->tensor.get()) )
+    {
+        InitSchedulerTensor(schedTensor.get(), ofmConn->tensor.get(), graph);
+        schedTensor->srcTensor = ofmConn->tensor;
+    }
+}
+
 void SchedulerPacking::SchedulerPacking::PrePackOperations()
 {
     // Determine if each operation can run on NPU
@@ -540,6 +581,12 @@ std::unique_ptr<SchedulerOperation> SchedulerPacking::MakeSchedulerOperation(Operation *op, const Graph *graph)
 {
     assert(op->Type() != OpType::None);
 
+    if ( op->Type() == OpType::ReinterpretCast )
+    {
+        HandleReinterpretCast(op, graph);
+        return nullptr;
+    }
+
     std::unique_ptr<SchedulerOperation> schedOp = std::make_unique<SchedulerOperation>(op->Type());
     schedOp->SetKernel(*op->Kernel());
 
@@ -578,6 +625,10 @@ std::unique_ptr<SchedulerOperation> SchedulerPacking::MakeSchedulerOperation(Operation *op, const Graph *graph)
             }
             SchedulerConnection *schedConn = IsOFM(item.first) ? schedOp->AddOutput(item.first) : schedOp->AddInput(item.first);
             InitSchedulerConnection(schedConn, schedTensor, item.second);
+            if ( IsIFM(item.first) && tensor->Type() != schedTensor->dataType )
+            {
+                schedConn->SetType(tensor->Type());
+            }
             schedConn->transpose = TransposeType::None;
         }
     }
diff --git a/ethosu/regor/compiler/scheduler_packing.hpp b/ethosu/regor/compiler/scheduler_packing.hpp
index eca43ca8..0289d04b 100644
--- a/ethosu/regor/compiler/scheduler_packing.hpp
+++ b/ethosu/regor/compiler/scheduler_packing.hpp
@@ -71,6 +71,7 @@ private:
     int CanPack(const SchedulerOperation *schedOp, const SchedulerOperation *prevOp, const SchedulerOperation *op, const int prevOpKey) const;
     void InitSchedulerConnection(SchedulerConnection *schedConn, const std::shared_ptr<SchedulerTensor> &tensor, const TensorConnection &conn);
     void InitSchedulerTensor(SchedulerTensor *schedTensor, Tensor *tensor, const Graph *graph);
+    void HandleReinterpretCast(Operation *op, const Graph *graph);
     std::unique_ptr<SchedulerOperation> MakeSchedulerOperation(Operation *op, const Graph *graph);
     std::vector<std::unique_ptr<SchedulerOperation>> DecomposeSchedulerOperation(std::unique_ptr<SchedulerOperation> op);
     ArchResampling ResamplingMode(TensorUsage usage, OpType opType) const;
diff --git a/ethosu/regor/test/test_tflite_supported_operators.cpp b/ethosu/regor/test/test_tflite_supported_operators.cpp
index bbef8e5b..05aaf67e 100644
--- a/ethosu/regor/test/test_tflite_supported_operators.cpp
+++ b/ethosu/regor/test/test_tflite_supported_operators.cpp
@@ -501,7 +501,6 @@ TEST_CASE("Supported operators EthosU55")
 {
     std::set<DataType> unsupported = {
         DataType::Int48,
-        DataType::Int64,
         DataType::UInt48,
         DataType::UInt64,
         DataType::QInt,
@@ -537,6 +536,7 @@ TEST_CASE("Supported operators EthosU55")
         DataType::Int8,
         DataType::Int16,
         DataType::Int32,
+        DataType::Int64,
     };
     for ( auto dtype : unsupported )
     {
diff --git a/ethosu/regor/tflite/tflite_supported_operators_u55.cpp b/ethosu/regor/tflite/tflite_supported_operators_u55.cpp
index 5bad24e3..b8d3b77b 100644
--- a/ethosu/regor/tflite/tflite_supported_operators_u55.cpp
+++ b/ethosu/regor/tflite/tflite_supported_operators_u55.cpp
@@ -89,6 +89,7 @@ TfLiteSupportedOperatorsU55::TfLiteSupportedOperatorsU55(IArchitectureConstraint
         DataType::Int8,
         DataType::Int16,
         DataType::Int32,
+        DataType::Int64,
         // clang-format on
     };
     _maxWeightSum8Bit = 127 * (1 << 16);
-- 
GitLab