From 92013dad07e0b861e4388fde7bfa216e5e347839 Mon Sep 17 00:00:00 2001 From: Jacob Bohlin Date: Fri, 25 Apr 2025 08:44:07 +0100 Subject: [PATCH] MLBEDSW-8926 Port LSTM to Regor * Ported lowering of TFLite::UnidirectionalSequenceLstm to Regor. * Added reading of TFLite intermediate tensors. Added a new TensorUsage::Intermediate for these tensors. * Added logic to allocate tensors which point to the same buffer to the same address, enabling this to be controlled in GraphIR. * Added an optional Tag to the Buffer hash function in order to differentiate between multiple empty buffers which stem from different TFLite variable tensors. * Added missing rescaling for Sigmoid and Tanh when fused with Elementwise Add, Sub or Mul. * Added some limitations to persistent tensors: - They are now required to be in linear format. - They can not share memory with non-persistent tensors. * Made a small modification to graph traversal so that partial writes are processed in the order they are added to the graph. * Added supported operator checks for UnidirectionalSequenceLstm. Change-Id: I6bd08822a41dca48b3aa8091b07747327b37d68f Signed-off-by: Jacob Bohlin --- ethosu/regor/CMakeLists.txt | 1 + .../ethosu55/ethos_u55_scaling.cpp | 11 +- .../ethosu85/ethos_u85_scaling.cpp | 12 +- ethosu/regor/common/buffer_view.hpp | 30 +- ethosu/regor/compiler/attributes.cpp | 1 + ethosu/regor/compiler/attributes.hpp | 12 + ethosu/regor/compiler/graph.hpp | 20 +- ethosu/regor/compiler/lstm.cpp | 401 ++++++++++++++++++ ethosu/regor/compiler/lstm.hpp | 67 +++ ethosu/regor/compiler/operation.hpp | 2 + ethosu/regor/compiler/operation_util.hpp | 27 ++ ethosu/regor/compiler/quantization.hpp | 7 + ethosu/regor/compiler/scheduler.cpp | 4 +- ethosu/regor/compiler/scheduler_packing.cpp | 27 ++ ethosu/regor/compiler/scheduler_packing.hpp | 1 + .../regor/compiler/tflite_graph_optimiser.cpp | 11 + .../regor/compiler/tflite_graph_optimiser.hpp | 3 + ethosu/regor/test/test_passthrough.cpp | 18 +- ethosu/regor/tflite/tflite_mapping.cpp | 24 ++ .../regor/tflite/tflite_model_semantics.cpp | 8 +- ethosu/regor/tflite/tflite_reader.cpp | 78 +++- .../tflite/tflite_supported_operators.cpp | 48 +++ .../tflite/tflite_supported_operators.hpp | 1 + .../tflite/tflite_supported_operators_u55.cpp | 1 + .../tflite/tflite_supported_operators_u85.cpp | 1 + ethosu/regor/tflite/tflite_writer.cpp | 17 +- 26 files changed, 786 insertions(+), 47 deletions(-) create mode 100644 ethosu/regor/compiler/lstm.cpp create mode 100644 ethosu/regor/compiler/lstm.hpp diff --git a/ethosu/regor/CMakeLists.txt b/ethosu/regor/CMakeLists.txt index 80aa074f..f315b48a 100644 --- a/ethosu/regor/CMakeLists.txt +++ b/ethosu/regor/CMakeLists.txt @@ -288,6 +288,7 @@ regor_lib( "compiler/scheduler_packing.cpp" "compiler/scheduler_operation.cpp" "compiler/softmax.cpp" + "compiler/lstm.cpp" "compiler/tensor.cpp" "compiler/tensor_allocator.cpp" "compiler/tflite_graph_optimiser.cpp" diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_scaling.cpp b/ethosu/regor/architecture/ethosu55/ethos_u55_scaling.cpp index ce88b528..4af22546 100644 --- a/ethosu/regor/architecture/ethosu55/ethos_u55_scaling.cpp +++ b/ethosu/regor/architecture/ethosu55/ethos_u55_scaling.cpp @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2021-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -75,12 +75,20 @@ void RescaleElementwise(HLCOperation *op) DataType ifmDataType = op->ifm[0].dataType; 
OpType opType = op->type; + double effectiveScale = 0; + if ( !op->subOps.empty() && (op->subOps[0].type == OpType::Sigmoid || op->subOps[0].type == OpType::Tanh) ) + { + // Adjust for Sigmoid/Tanh effective output scale. + effectiveScale = 1.0 / 0x3000; + } + bool allHaveScale = (!ifm1Quant->scales.empty() && !ofmQuant->scales.empty() && ifm2Quant != nullptr && !ifm2Quant->scales.empty()); if ( opType == OpType::Mul ) { if ( allHaveScale ) { + ofmScale = effectiveScale ? effectiveScale : ofmScale; outScale = ElementwiseMulScale(ifm1Scale, ifm2Scale, ofmScale); } } @@ -95,6 +103,7 @@ void RescaleElementwise(HLCOperation *op) } else if ( opType == OpType::Add || opType == OpType::Sub ) { + ofmScale = effectiveScale ? effectiveScale : ofmScale; int bitDepth = DataTypeSizeBits(ifmDataType); bool useAdvancedScaling = false; uint32_t opaScale = 1; diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85_scaling.cpp b/ethosu/regor/architecture/ethosu85/ethos_u85_scaling.cpp index fafa2d57..b7171177 100644 --- a/ethosu/regor/architecture/ethosu85/ethos_u85_scaling.cpp +++ b/ethosu/regor/architecture/ethosu85/ethos_u85_scaling.cpp @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2021-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -132,6 +132,14 @@ void RescaleElementwise(HLCOperation *op) DataType ifmDataType = op->ifm[0].dataType; OpType opType = op->type; + + double effectiveScale = 0; + if ( !op->subOps.empty() && (op->subOps[0].type == OpType::Sigmoid || op->subOps[0].type == OpType::Tanh) ) + { + // Adjust for Sigmoid/Tanh effective output scale. + effectiveScale = 1.0 / 0x3000; + } + bool allHaveScale = (!ifm1Quant->scales.empty() && !ofmQuant->scales.empty() && ifm2Quant != nullptr && !ifm2Quant->scales.empty()); @@ -144,6 +152,7 @@ void RescaleElementwise(HLCOperation *op) { if ( allHaveScale ) { + ofmScale = effectiveScale ? effectiveScale : ofmScale; outScale = ElementwiseMulScale(ifm1Scale, ifm2Scale, ofmScale); } } @@ -159,6 +168,7 @@ void RescaleElementwise(HLCOperation *op) { if ( allHaveScale ) { + ofmScale = effectiveScale ? effectiveScale : ofmScale; AdvancedElementwiseAddSubScale(ifm1Scale, ifm2Scale, ofmScale, bitDepth, input1Scale, input2Scale, outScale); } } diff --git a/ethosu/regor/common/buffer_view.hpp b/ethosu/regor/common/buffer_view.hpp index 494172fc..0dc848e6 100644 --- a/ethosu/regor/common/buffer_view.hpp +++ b/ethosu/regor/common/buffer_view.hpp @@ -331,15 +331,27 @@ public: void Rehash() { - // Calculate MD5 hash of data, prefixed by the size of data - const auto buffer = const_cast(this); - std::string sizeStr("<"); - sizeStr += std::to_string(buffer->Size()); - sizeStr += '>'; - MD5 hash; - hash.Combine(reinterpret_cast(sizeStr.data()), int(sizeStr.size())); - hash.Combine(buffer->Data(), buffer->Size()); - hash.Get(_dataHash); + if ( Size() > 0 ) + { + // Calculate MD5 hash of data, prefixed by the size of data + std::string sizeStr("<"); + sizeStr += std::to_string(Size()); + sizeStr += '>'; + MD5 hash; + // Make sure the const overload of Data() is called + const uint8_t *data = std::as_const(*this).Data(); + hash.Combine(reinterpret_cast(sizeStr.data()), int(sizeStr.size())); + hash.Combine(data, Size()); + hash.Get(_dataHash); + } + else + { + // If the buffer is empty use the pointer to this buffer object as a hash to + // disambiguate between different empty buffers. 
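+            // Every TFLite variable tensor is given its own empty Buffer by the reader, and the
+            // scheduler keys its shared-allocation (equivalenceId) map on Buffer::Hash(), so a pure
+            // size+data hash would make all empty buffers collide and merge unrelated state tensors
+            // into a single allocation.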
+ uintptr_t ptr = reinterpret_cast(this); + _dataHash.v32[0] = _dataHash.v32[1] = static_cast(ptr); + _dataHash.v32[2] = _dataHash.v32[3] = static_cast(ptr >> 32); + } } private: diff --git a/ethosu/regor/compiler/attributes.cpp b/ethosu/regor/compiler/attributes.cpp index 659745ab..367bc1e5 100644 --- a/ethosu/regor/compiler/attributes.cpp +++ b/ethosu/regor/compiler/attributes.cpp @@ -55,6 +55,7 @@ DynamicRef CreateAttribute(uint32_t reducedHash) CASE_MAKE_ATTR_INSTANCE(transpose_conv2d_attr_t); CASE_MAKE_ATTR_INSTANCE(while_attr_t); CASE_MAKE_ATTR_INSTANCE(mirror_pad_mode_attr_t); + CASE_MAKE_ATTR_INSTANCE(unidirectional_sequence_lstm_attr_t); default: assert(false && "No attribute has this reduced hash"); // Add a new XXX_attr_t struct to the header then diff --git a/ethosu/regor/compiler/attributes.hpp b/ethosu/regor/compiler/attributes.hpp index 4ce8260e..8f55caeb 100644 --- a/ethosu/regor/compiler/attributes.hpp +++ b/ethosu/regor/compiler/attributes.hpp @@ -280,6 +280,18 @@ struct mirror_pad_mode_attr_t END_FIELD_TABLE() }; +struct unidirectional_sequence_lstm_attr_t +{ + int cell_clip; + int projection_clip; + bool time_major; + BEGIN_FIELD_TABLE(unidirectional_sequence_lstm_attr_t) + ATTR_FIELD(cell_clip, 0) + ATTR_FIELD(projection_clip, 1) + ATTR_FIELD(time_major, 2) + END_FIELD_TABLE() +}; + #define REDUCED_HASH(hash) (hash & 0x000FFFFF) DynamicRef CreateAttribute(uint32_t hash); diff --git a/ethosu/regor/compiler/graph.hpp b/ethosu/regor/compiler/graph.hpp index c4c7bbd3..90b3d72b 100644 --- a/ethosu/regor/compiler/graph.hpp +++ b/ethosu/regor/compiler/graph.hpp @@ -151,6 +151,14 @@ public: void SetScheduledOrder(std::vector operations) { _opsInScheduledOrder = std::move(operations); } + // Traverse the graph in right-to-left reverse post-order but processing tensor writers left-to-right. + // This means in below graph, where A and B both write to the input tensor of C, A will be processed + // before B. + // A B + // \ / + // | + // C + // The rationale is to preserve the order that partial writes are added to the graph. template static void TraverseGraphFromEnd(const std::vector> &from, OPFUNC opFunc) { @@ -166,9 +174,10 @@ public: for ( const auto &tensor : from ) { - for ( const auto &op : tensor->Writers() ) + const auto &writers = tensor->Writers(); + for ( auto it = writers.crbegin(); it != writers.crend(); it++ ) { - stack.emplace(false, op); + stack.emplace(false, *it); } } @@ -189,11 +198,12 @@ public: stack.emplace(true, entry.op); for ( const auto &pair : entry.op->Inputs().pairs() ) { - for ( const auto &op : pair.second.tensor->Writers() ) + const auto &writers = pair.second.tensor->Writers(); + for ( auto it = writers.crbegin(); it != writers.crend(); it++ ) { - if ( visited.count(op.get()) == 0 ) + if ( visited.count(it->get()) == 0 ) { - stack.emplace(false, op); + stack.emplace(false, *it); } } } diff --git a/ethosu/regor/compiler/lstm.cpp b/ethosu/regor/compiler/lstm.cpp new file mode 100644 index 00000000..f112770e --- /dev/null +++ b/ethosu/regor/compiler/lstm.cpp @@ -0,0 +1,401 @@ +// +// SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "lstm.hpp" + +#include "operation_util.hpp" +#include "quantization.hpp" + +namespace regor +{ + +static constexpr double Q0_15_SCALE = 1.0 / (1 << 15); +static constexpr double Q3_12_SCALE = 1.0 / (1 << 12); + +LSTM::LSTM(Operation *operation, OptimiserDatabase *db, Graph *graph) : _lstmOp(operation), _db(db), _graph(graph) +{ + assert(_lstmOp->Type() == OpType::UnidirectionalSequenceLstm); + + // Attributes + assert(operation->HasAttribute()); + auto *attr = operation->Attribute(); + _isTimeMajor = attr->time_major; + _cellClip = attr->cell_clip; + + // Input/Output + _ifmConn = _lstmOp->Input(TensorUsage::IFM); + _ofmConn = _lstmOp->Output(TensorUsage::OFM); + + // Input dimensions + Shape ifmShape = _ifmConn->shape; + _nFeature = ifmShape[-1]; + _nTime = ifmShape[_isTimeMajor ? 0 : 1]; + _nBatch = ifmShape[_isTimeMajor ? 1 : 0]; +} + +void LSTM::RecordOptimisation(Operation *op) +{ + if ( _db ) + { + _db->AddOptimised(_lstmOp, op); + } +} + +Operation *LSTM::ConvertOp() +{ + Operation *returnOp = _lstmOp; + int numBatches = _isTimeMajor ? 1 : _nBatch; + for ( int batch = 0; batch < numBatches; batch++ ) + { + TensorConnection *outputState = GetInitialState(TensorUsage::State, batch); + TensorConnection *cellState = GetInitialState(MakeTensorUsage(TensorUsage::State, 1), batch); + for ( int time = 0; time < _nTime; time++ ) + { + TensorConnection *feature = ExtractFeatureSlice(time, batch); + assert(feature); + std::tie(outputState, cellState) = Step(feature, outputState, cellState, time, batch); + returnOp = SetOutputWrite(outputState, time, batch); + } + } + + if ( returnOp != _lstmOp ) + { + _lstmOp->Disconnect(); + } + return returnOp; +} + +// Extract time and batch slice of the input tensor. +TensorConnection *LSTM::ExtractFeatureSlice(int time, int batch) +{ + std::shared_ptr featureTensor = _ifmConn->tensor->Clone(); + featureTensor->SetName(fmt::format("{0}_feauture_b{1}.t{2}", featureTensor->Name(), batch, time)); + featureTensor->SetStorageShape({(_isTimeMajor ? _nBatch : 1), _nFeature}); + auto op = std::make_shared(OpType::Slice); + + auto readShape = featureTensor->StorageShape(); + auto readOffset = _isTimeMajor ? Shape(time, 0, 0) : Shape(batch, time, 0); + auto *attr = op->Attribute(); + attr->size = readShape; + attr->begin = readOffset; + + op->CopyInput(TensorUsage::IFM, *_ifmConn); + op->ConnectOutput(TensorUsage::OFM, featureTensor).Set(_ifmConn->quantization); + RecordOptimisation(op.get()); + return op->Output(TensorUsage::OFM); +} + +// Get state tensor for provided state type and batch +TensorConnection *LSTM::GetInitialState(TensorUsage stateUsage, int batch) +{ + TensorConnection *stateConn = _lstmOp->Input(stateUsage); + if ( _isTimeMajor ) + { + // For time major, return the state tensor directly since all + // batches are calculated in the same step. + return stateConn; + } + else + { + // For batch major, return one batch slice of the state tensor. 
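+        // In batch-major mode each batch is unrolled as its own sequence of steps, so every step
+        // must read and write only its own row of the state; the MemoryCopy below provides that
+        // per-batch view through a read/write slice at the current batch offset.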
+ // The tensor has to be cloned in order to resolve graph dependencies correctly but + // the clone will share the underlying buffer with the original state tensor which + // ensure they are allocated to the same address. + std::shared_ptr newStateTensor = stateConn->tensor->Clone(); + + // Set read/write shape to be one batch and read/write offset to the current batch. + TensorSlice slice({0, 0, batch, 0}, {1, 1, 1, stateConn->shape[-1]}); + const auto &stateQuant = stateConn->quantization; + + auto op = std::make_shared(OpType::MemoryCopy); + op->ConnectInput(TensorUsage::IFM, stateConn->tensor).Set(slice).Set(stateQuant); + op->ConnectOutput(TensorUsage::OFM, newStateTensor).Set(slice).Set(stateQuant); + + // Mark the cloned tensor as persistent to require linear format and avoid fusing with + // other tensors. + _graph->AddPersistent(newStateTensor); + RecordOptimisation(op.get()); + return op->Output(TensorUsage::OFM); + } +} + +// Setup the correct read shape and offset for reading from a state tensor. +void LSTM::SetStateRead(Operation *op, int batch) +{ + if ( !_isTimeMajor && _nBatch > 1 ) + { + Shape cellStateShape = _lstmOp->Input(MakeTensorUsage(TensorUsage::State, 1))->shape; + const Shape ifmShape = op->Input(TensorUsage::IFM)->shape; + op->Input(TensorUsage::IFM)->Set(cellStateShape).Set({{0, 0, batch, 0}, {1, 1, 1, ifmShape[-1]}}); + } +} + +// Write the state for the provided batch by pointing the operations ofm to the state tensor. +void LSTM::SetStateWrite(Operation *op, TensorUsage stateUsage, int batch) +{ + TensorConnection *stateConn = _lstmOp->Input(stateUsage); + + auto ofmConn = op->Output(TensorUsage::OFM); + auto ofmShape = Shape::PadAxes(ofmConn->shape, 4, 1); + + std::shared_ptr newStateTensor = stateConn->tensor->Clone(); + op->ConnectOutput(TensorUsage::OFM, newStateTensor).Set(stateConn->shape).Set(stateConn->quantization); + + if ( !_isTimeMajor && _nBatch > 1 ) + { + auto writeOffset = Shape(0, 0, batch, 0); + ofmConn->Set({writeOffset, ofmShape}); + } + + // Mark the cloned tensor as persistent to require linear format and avoid fusing with + // other tensors. + _graph->AddPersistent(newStateTensor); +} + +// Copy the output state to the time/batch slice of the final output. +Operation *LSTM::SetOutputWrite(TensorConnection *stateConn, int time, int batch) +{ + auto concatOp = std::make_shared(OpType::MemoryCopy); + + auto concatIfmConn = &concatOp->ConnectInput(TensorUsage::IFM, stateConn->tensor).Set(stateConn->shape).Set(_ofmConn->quantization); + + if ( !_isTimeMajor && _nBatch > 1 ) + { + Shape readOffset(0, 0, batch, 0); + Shape readSize(1, 1, 1, stateConn->shape[-1]); + concatIfmConn->Set({readOffset, readSize}); + } + + Shape writeOffset = _isTimeMajor ? Shape(0, time, 0, 0) : Shape(0, batch, time, 0); + Shape writeShape = _isTimeMajor ? 
Shape(1, 1, stateConn->shape[-2], stateConn->shape[-1]) : Shape(1, 1, 1, stateConn->shape[-1]); + concatOp->ConnectOutput(TensorUsage::OFM, _ofmConn->tensor) + .Set(_ofmConn->shape) + .Set(_ofmConn->quantization) + .Set({writeOffset, writeShape}) + .Set(RoundMode::NATURAL); + + RecordOptimisation(concatOp.get()); + return concatOp.get(); +} + +// Generate a gate for the provided input and weights +// Activation( Add( FullyConnected(input feature), FullyConnected(output state) ) ) +TensorConnection *LSTM::CalculateGate(const std::string &name, TensorConnection *featureConn, TensorConnection *stateConn, + TensorConnection *inputWeightConn, TensorConnection *inputBiasConn, TensorConnection *recurrentWeightConn, OpType activationType, int batch) +{ + // Setup fullyconnected output quantization + Quantization fcQuant; + fcQuant.type = QuantizationType::TFLITE; + fcQuant.scales = {Q3_12_SCALE}; + fcQuant.zeroPoints = {0}; + + Operation *inputFC = CreateFullyConnected(fmt::format("{0}_feature_fc", name), featureConn->tensor, + inputWeightConn->tensor, featureConn->quantization, inputWeightConn->quantization, fcQuant, + featureConn->SliceShape(), DataType::Int16, inputBiasConn->tensor, inputBiasConn->quantization); + TensorConnection *inputFCOfmConn = inputFC->Output(TensorUsage::OFM); + + Operation *recurrentFC = CreateFullyConnected(fmt::format("{0}_recurrent_fc", name), stateConn->tensor, + recurrentWeightConn->tensor, stateConn->quantization, recurrentWeightConn->quantization, fcQuant, + stateConn->SliceShape(), DataType::Int16); + SetStateRead(recurrentFC, batch); + TensorConnection *recurrentFCOfmConn = recurrentFC->Output(TensorUsage::OFM); + + Quantization addQuant; + addQuant.type = QuantizationType::TFLITE; + addQuant.scales = {1.0f}; + addQuant.zeroPoints = {0}; + Operation *add = CreateAdd(inputFCOfmConn->tensor, recurrentFCOfmConn->tensor, inputFCOfmConn->quantization, + recurrentFCOfmConn->quantization, addQuant); + + // Create activation function + Quantization activationQuant; + activationQuant.type = QuantizationType::TFLITE; + activationQuant.scales = {1.0f}; + activationQuant.zeroPoints = {0}; + + auto activation = std::make_shared(activationType); + auto addOfmTensor = add->Output(TensorUsage::OFM)->tensor; + + activation->ConnectInput(TensorUsage::IFM, addOfmTensor).Set(addQuant); + activation->ConnectOutput(TensorUsage::OFM, addOfmTensor->Clone()).Set(activationQuant); + + auto returnConn = activation->Output(TensorUsage::OFM); + if ( activationType == OpType::Sigmoid ) + { + // For Sigmoid we need to set the activation min/max values to match the possible range + // in the reference. The values below are the quantized min/max values that the reference + // can achive for the LUT based Sigmoid/Logistic. (The NPU does however have a larger range + // due to intermediate higher precision.) + auto clamp = std::make_shared(OpType::Clamp); + auto *attr = clamp->Attribute(); + attr->max = Quantize(32757.0f, activationQuant); + attr->min = Quantize(11.0f, activationQuant); + + // Copying the input and output of the Add means the Clamp will also write to the cell state. 
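+        // (For this gate the copied connection is the activation's own output rather than the cell
+        // state: the Clamp rewrites the gate output in place, restoring the reference output range
+        // that the NPU's higher intermediate precision would otherwise exceed.)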
+ clamp->CopyInput(TensorUsage::IFM, *returnConn); + clamp->CopyOutput(TensorUsage::OFM, *returnConn); + + RecordOptimisation(clamp.get()); + returnConn = clamp->Output(TensorUsage::OFM); + } + + RecordOptimisation(inputFC); + RecordOptimisation(recurrentFC); + RecordOptimisation(add); + RecordOptimisation(activation.get()); + return returnConn; +} + +// Calculate and update the cell state from the provided gates +// Clip( Add( Mul( cell state, forget gate ), Mul( cell gate, input gate ) ) ) +TensorConnection *LSTM::CalculateCellState(TensorConnection *cellStateConn, TensorConnection *inputGateConn, + TensorConnection *forgetGateConn, TensorConnection *cellGateConn, int time, int batch) +{ + const Quantization &cellStateQuant = cellStateConn->quantization; + double cellStateScale = cellStateQuant.scales[0].Dequantize(); + // Calculate explicit scales based on the cell state quantization + Quantization mulCFQuant; + mulCFQuant.type = QuantizationType::TFLITE; + mulCFQuant.scales = {ElementwiseMulScale(cellStateScale, Q0_15_SCALE, cellStateScale)}; + mulCFQuant.zeroPoints = {cellStateConn->quantization.zeroPoints[0]}; + // Create Mul(cell_state, forget_gate) + Operation *mulCF = CreateMul(cellStateConn->tensor, forgetGateConn->tensor, mulCFQuant, mulCFQuant, mulCFQuant, + DataType::None, &forgetGateConn->shape, &forgetGateConn->shape); + SetStateRead(mulCF, batch); + + // Calculate explicit scales based on the cell state quantization + Quantization mulCIQuant; + mulCIQuant.type = QuantizationType::TFLITE; + mulCIQuant.scales = {ElementwiseMulScale(Q0_15_SCALE, Q0_15_SCALE, cellStateScale)}; + mulCIQuant.zeroPoints = {cellStateConn->quantization.zeroPoints[0]}; + // Create Mul(cell_gate, input_gate) + Operation *mulCI = CreateMul(cellGateConn->tensor, inputGateConn->tensor, mulCIQuant, mulCIQuant, mulCIQuant); + + // Create Add with cell state quantization + Operation *add = CreateAdd(mulCF->Output(TensorUsage::OFM)->tensor, mulCI->Output(TensorUsage::OFM)->tensor, + cellStateQuant, cellStateQuant, cellStateQuant); + // Redirect the ofm of Add to cell state. + SetStateWrite(add, MakeTensorUsage(TensorUsage::State, 1), batch); + + RecordOptimisation(mulCF); + RecordOptimisation(mulCI); + RecordOptimisation(add); + + TensorConnection *returnConn = add->Output(TensorUsage::OFM); + if ( _cellClip != 0 ) + { + // If the cell clip attribute is non-zero the output needs to be clamped. + auto clamp = std::make_shared(OpType::Clamp); + auto *attr = clamp->Attribute(); + attr->max = Quantize(static_cast(_cellClip), cellStateQuant); + attr->min = Quantize(static_cast(-_cellClip), cellStateQuant); + + // Copying the input and output of the Add means the Clamp will also write to the cell state. 
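+        // (The Add's OFM was redirected to the cell state tensor by SetStateWrite above, so the
+        // Clamp becomes the final writer of the cell state for this step.)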
+ clamp->CopyInput(TensorUsage::IFM, *returnConn); + clamp->CopyOutput(TensorUsage::OFM, *returnConn); + + RecordOptimisation(clamp.get()); + returnConn = clamp->Output(TensorUsage::OFM); + } + + return returnConn; +} + +// Calculate and update the output state from the provided gate output +// Mul( Tanh(cell state), output gate ) +TensorConnection *LSTM::CalculateOutputState(TensorConnection *outputGateConn, TensorConnection *cellStateConn, int time, int batch) +{ + // Setup tanh quantization + Quantization tanhQuant; + tanhQuant.type = QuantizationType::TFLITE; + tanhQuant.scales = {QuantizedScale(Q0_15_SCALE)}; + tanhQuant.zeroPoints = {0}; + + // Create tanh(cell state) + auto tanh = std::make_shared(OpType::Tanh); + tanh->ConnectInput(TensorUsage::IFM, cellStateConn->tensor).Set(cellStateConn->shape).Set(cellStateConn->quantization); + + // Tanh reads from the cell state. This may set an ifm slice which the ofm shape needs to honor. + SetStateRead(tanh.get(), batch); + auto tanhIfmConn = tanh->Input(TensorUsage::IFM); + Shape tanhOfmShape = tanhIfmConn->SliceShape(); + // Create a new tensor for ofm instead of cloning, this ensures that the tanh output will not + // overwrite the cell state. + auto ofmName = fmt::format("{0}_tanh_b{1}.t{2}", cellStateConn->tensor->Name(), batch, time); + auto tanhOfm = std::make_shared(ofmName, cellStateConn->tensor->Type(), tanhOfmShape); + tanh->ConnectOutput(TensorUsage::OFM, tanhOfm).Set(tanhQuant); + + // Create Mul( Tanh, output gate ) + // Ofm quantization is based on the hidden scale. + double hiddenScale = _lstmOp->Input(MakeTensorUsage(TensorUsage::Scratch, 4))->quantization.scales[0].Dequantize(); + auto mulQuant = _ofmConn->quantization; + mulQuant.type = QuantizationType::TFLITE; + mulQuant.scales = {ElementwiseMulScale(Q0_15_SCALE, Q0_15_SCALE, hiddenScale)}; + Operation *mul = CreateMul(tanhOfm, outputGateConn->tensor, tanhQuant, tanhQuant, mulQuant, _ifmConn->tensor->Type()); + + // Save new output state + SetStateWrite(mul, TensorUsage::State, batch); + + RecordOptimisation(tanh.get()); + RecordOptimisation(mul); + return mul->Output(TensorUsage::OFM); +} + + +// Generate one step of the LSTM for the provided feature, batch and time +std::pair LSTM::Step(TensorConnection *featureConn, + TensorConnection *outputStateConn, TensorConnection *cellStateConn, int time, int batch) +{ + assert(outputStateConn && cellStateConn); + auto suffix = fmt::format("b{0}.t{1}", batch, time); + + auto inputToInputWeightConn = _lstmOp->Input(MakeTensorUsage(TensorUsage::Weights, 0)); + auto recurrentToInputWeightConn = _lstmOp->Input(MakeTensorUsage(TensorUsage::Weights, 4)); + auto inputBiasConn = _lstmOp->Input(MakeTensorUsage(TensorUsage::Scales, 0)); + TensorConnection *inputGate = CalculateGate(fmt::format("input_gate_{0}", suffix), featureConn, outputStateConn, + inputToInputWeightConn, inputBiasConn, recurrentToInputWeightConn, OpType::Sigmoid, batch); + + auto inputToForgetWeightConn = _lstmOp->Input(MakeTensorUsage(TensorUsage::Weights, 1)); + auto recurrentToForgetWeightConn = _lstmOp->Input(MakeTensorUsage(TensorUsage::Weights, 5)); + auto forgetBiasConn = _lstmOp->Input(MakeTensorUsage(TensorUsage::Scales, 1)); + TensorConnection *forgetGate = CalculateGate(fmt::format("forget_gate_{0}", suffix), featureConn, outputStateConn, + inputToForgetWeightConn, forgetBiasConn, recurrentToForgetWeightConn, OpType::Sigmoid, batch); + + auto inputToCellWeightConn = _lstmOp->Input(MakeTensorUsage(TensorUsage::Weights, 2)); + auto recurrentToCellWeightConn 
= _lstmOp->Input(MakeTensorUsage(TensorUsage::Weights, 6)); + auto cellBiasConn = _lstmOp->Input(MakeTensorUsage(TensorUsage::Scales, 2)); + TensorConnection *cellGate = CalculateGate(fmt::format("cell_gate_{0}", suffix), featureConn, outputStateConn, + inputToCellWeightConn, cellBiasConn, recurrentToCellWeightConn, OpType::Tanh, batch); + + // Calculate and update cell state + cellStateConn = CalculateCellState(cellStateConn, inputGate, forgetGate, cellGate, time, batch); + + auto inputToOutputWeightConn = _lstmOp->Input(MakeTensorUsage(TensorUsage::Weights, 3)); + auto recurrentToOutputWeightConn = _lstmOp->Input(MakeTensorUsage(TensorUsage::Weights, 7)); + auto outputBiasConn = _lstmOp->Input(MakeTensorUsage(TensorUsage::Scales, 3)); + TensorConnection *outputGate = CalculateGate(fmt::format("output_gate_{0}", suffix), featureConn, outputStateConn, + inputToOutputWeightConn, outputBiasConn, recurrentToOutputWeightConn, OpType::Sigmoid, batch); + + // Calculate and update ouput state + assert(cellStateConn); + outputStateConn = CalculateOutputState(outputGate, cellStateConn, time, batch); + + return {outputStateConn, cellStateConn}; +} + +} // namespace regor diff --git a/ethosu/regor/compiler/lstm.hpp b/ethosu/regor/compiler/lstm.hpp new file mode 100644 index 00000000..fbe2eb1d --- /dev/null +++ b/ethosu/regor/compiler/lstm.hpp @@ -0,0 +1,67 @@ +// +// SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#pragma once + +#include "graph_optimiser.hpp" +#include "operation.hpp" + +namespace regor +{ + +/// +/// TFLite Graph optimiser LSTM rewriter +/// +class LSTM +{ +private: + Operation *_lstmOp = nullptr; + OptimiserDatabase *_db = nullptr; + Graph *_graph = nullptr; + + // Dimensions + int _nFeature, _nTime, _nBatch; + // Attributes + int _cellClip; + bool _isTimeMajor; + // Input/Output + TensorConnection *_ifmConn = nullptr; + TensorConnection *_ofmConn = nullptr; + +public: + LSTM(Operation *operation, OptimiserDatabase *db, Graph *graph); + Operation *ConvertOp(); + +private: + void RecordOptimisation(Operation *op); + TensorConnection *ExtractFeatureSlice(int time, int batch); + TensorConnection *GetInitialState(TensorUsage stateUsage, int batch); + void SetStateRead(Operation *op, int batch); + void SetStateWrite(Operation *op, TensorUsage stateUsage, int batch); + Operation *SetOutputWrite(TensorConnection *stateConn, int time, int batch); + TensorConnection *CalculateGate(const std::string &name, TensorConnection *featureConn, TensorConnection *stateConn, + TensorConnection *inputWeightConn, TensorConnection *inputBiasConn, TensorConnection *recurrentWeightConn, + OpType activationType, int batch); + TensorConnection *CalculateCellState(TensorConnection *cellStateConn, TensorConnection *inputGateConn, + TensorConnection *forgetGateConn, TensorConnection *cellGateConn, int time, int batch); + TensorConnection *CalculateOutputState(TensorConnection *outputGateConn, TensorConnection *cellStateConn, int time, int batch); + std::pair Step(TensorConnection *featureConn, + TensorConnection *outputStateConn, TensorConnection *cellStateConn, int time, int batch); +}; + +} // namespace regor diff --git a/ethosu/regor/compiler/operation.hpp b/ethosu/regor/compiler/operation.hpp index c2a72a9e..bd3283be 100644 --- a/ethosu/regor/compiler/operation.hpp +++ b/ethosu/regor/compiler/operation.hpp @@ -122,6 +122,8 @@ struct TensorConnection rounding = r; return *this; } + + const Shape &SliceShape() const { return slice.shape ? slice.shape : shape; } }; diff --git a/ethosu/regor/compiler/operation_util.hpp b/ethosu/regor/compiler/operation_util.hpp index d3fc1edc..b77e8ed4 100644 --- a/ethosu/regor/compiler/operation_util.hpp +++ b/ethosu/regor/compiler/operation_util.hpp @@ -261,6 +261,33 @@ inline Operation *CreateRescaleAdd(const std::shared_ptr &ifm, const std return op; } +inline Operation *CreateFullyConnected(const std::string &name, const std::shared_ptr &ifm, + const std::shared_ptr &weights, const Quantization &ifmQuantization, const Quantization &weightQuantization, + const Quantization &ofmQuantization, const Shape ifmShape, DataType ofmDtype = DataType::None, + std::shared_ptr bias = nullptr, const Quantization &biasQuantization = Quantization::Unit()) +{ + int numOutputs = weights->StorageShape()[0]; + + auto op = std::make_shared(OpType::FullyConnected); + op->ConnectInput(TensorUsage::IFM, ifm).Set(ifmShape).Set(ifmQuantization); + op->ConnectInput(TensorUsage::Weights, weights).Set(weights->StorageShape()).Set(weightQuantization); + + if ( bias == nullptr ) + { + DataType biasType = ifm->Type() == DataType::Int16 ? 
DataType::Int64 : DataType::Int32; + std::vector zeroBuf(DataTypeStorageSizeBytes(biasType, 1), 0); + bias = CreateConstTensor(name + std::string("_bias"), biasType, std::make_shared(std::move(zeroBuf))); + } + + op->ConnectInput(TensorUsage::Scales, bias).Set(Shape(numOutputs)).Set(biasQuantization); + + // Setup OFM + if ( ofmDtype == DataType::None ) ofmDtype = ifm->Type(); + auto ofm = std::make_shared(name + "_ofm", ofmDtype, Shape(ifmShape[0], numOutputs)); + op->ConnectOutput(TensorUsage::OFM, ofm).Set(ofmQuantization); + return op.get(); +} + inline TransposeType CalculateTransposeType(const Operation &operation) { const auto *paramsConn = operation.Input(TensorUsage::Params); diff --git a/ethosu/regor/compiler/quantization.hpp b/ethosu/regor/compiler/quantization.hpp index dd054078..19c8caf4 100644 --- a/ethosu/regor/compiler/quantization.hpp +++ b/ethosu/regor/compiler/quantization.hpp @@ -89,4 +89,11 @@ public: } }; +inline int64_t Quantize(float value, const Quantization &quant) +{ + float scale = quant.scales.empty() ? 1.0f : float(quant.scales[0].Dequantize()); + int64_t zp = quant.zeroPoints.empty() ? 0 : quant.zeroPoints[0]; + return zp + int64_t(std::round(double(value / scale))); +} + } // namespace regor diff --git a/ethosu/regor/compiler/scheduler.cpp b/ethosu/regor/compiler/scheduler.cpp index df9c8d64..9beb52fd 100644 --- a/ethosu/regor/compiler/scheduler.cpp +++ b/ethosu/regor/compiler/scheduler.cpp @@ -206,8 +206,8 @@ int Scheduler::UpdateSchedulerTensor(TensorUsage usage, SchedulerConnection *con conn->requireFullTensor = true; } - // Force linear format for read only tensors - if ( tensor->IsConstant() ) + // Force linear format for read only or persistent tensors + if ( tensor->IsConstant() || tensor->isPersistent ) { tensor->needsLinearFormat = true; } diff --git a/ethosu/regor/compiler/scheduler_packing.cpp b/ethosu/regor/compiler/scheduler_packing.cpp index a6ff98b3..550f8df2 100644 --- a/ethosu/regor/compiler/scheduler_packing.cpp +++ b/ethosu/regor/compiler/scheduler_packing.cpp @@ -429,6 +429,18 @@ int SchedulerPacking::CanPack(const SchedulerOperation *schedOp, const Scheduler return 0; } + if ( schedOp->Type() == OpType::FullyConnected ) + { + return 0; + } + + // Do not pack persistent tensors with non persistent tensors + // if ( ifmTensor->isPersistent != prevOFM->isPersistent ) + if ( prevOFM->isPersistent != nextOp->OFM()->tensor->isPersistent ) + { + return 0; + } + // Previous op in execution order doesn't connect to this one if ( prevOFM != ifmTensor && prevOFM != ifm2Tensor ) { @@ -497,6 +509,21 @@ void SchedulerPacking::InitSchedulerTensor(SchedulerTensor *schedTensor, Tensor schedTensor->isGraphOutput = graph->IsOutput(tensor); schedTensor->isPersistent = graph->IsPersistent(tensor); schedTensor->uid = tensor->Uid(); + if ( tensor->View().HasBuffer() ) + { + // Assign equivalenceIds based on the underlying buffer of the GraphIR tensor (if present). + // This ensures that all tensors sharing a buffer will be allocated to the same memory. 
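+        // For example, the per-step state clones created by the LSTM lowering share the original
+        // variable tensor's buffer, so the lookup below hands every clone the same equivalenceId
+        // and the allocator places them all at one address.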
+ auto buffer = tensor->View().Buffer(); + auto eqId = _bufferEquivalenceIdMap.find(buffer->Hash()); + if ( eqId == _bufferEquivalenceIdMap.end() ) + { + _bufferEquivalenceIdMap.emplace(buffer->Hash(), schedTensor->equivalenceId); + } + else + { + schedTensor->equivalenceId = eqId->second; + } + } } std::unique_ptr SchedulerPacking::MakeSchedulerOperation(Operation *op, const Graph *graph) diff --git a/ethosu/regor/compiler/scheduler_packing.hpp b/ethosu/regor/compiler/scheduler_packing.hpp index 40621851..eca43ca8 100644 --- a/ethosu/regor/compiler/scheduler_packing.hpp +++ b/ethosu/regor/compiler/scheduler_packing.hpp @@ -50,6 +50,7 @@ protected: bool _disableChaining = false; std::vector> _schedList; std::unordered_map> _tensorMap; + std::unordered_map _bufferEquivalenceIdMap; public: SchedulerPacking(Architecture *arch, bool disableChaining); diff --git a/ethosu/regor/compiler/tflite_graph_optimiser.cpp b/ethosu/regor/compiler/tflite_graph_optimiser.cpp index 4ac1eaa7..e3ecd8c7 100644 --- a/ethosu/regor/compiler/tflite_graph_optimiser.cpp +++ b/ethosu/regor/compiler/tflite_graph_optimiser.cpp @@ -26,6 +26,7 @@ #include "common/transpose_type.hpp" #include "graph.hpp" #include "graph_optimiser.hpp" +#include "lstm.hpp" #include "op_type.hpp" #include "operation.hpp" #include "optimiser_utils.hpp" @@ -1701,6 +1702,16 @@ Operation *TFLiteGraphOptimiser::ConvertSoftmaxOps(Graph *const graph, Operation return _softmax->ConvertOp(operation); } +Operation *TFLiteGraphOptimiser::ConvertLstmOps(Graph *const graph, Operation *const operation) +{ + if ( operation->Type() == OpType::UnidirectionalSequenceLstm ) + { + auto lstmLowering = LSTM(operation, _db, graph); + return lstmLowering.ConvertOp(); + } + return operation; +} + Operation *TFLiteGraphOptimiser::ConvertMeanOps(Graph *const, Operation *const operation) { auto returnOp = operation; diff --git a/ethosu/regor/compiler/tflite_graph_optimiser.hpp b/ethosu/regor/compiler/tflite_graph_optimiser.hpp index 72f90b0b..30710470 100644 --- a/ethosu/regor/compiler/tflite_graph_optimiser.hpp +++ b/ethosu/regor/compiler/tflite_graph_optimiser.hpp @@ -131,6 +131,8 @@ private: Operation *ConvertSoftmaxOps(Graph *const graph, Operation *const operation); + Operation *ConvertLstmOps(Graph *const graph, Operation *const operation); + Operation *ConvertMeanOps(Graph *const, Operation *const operation); // Converts int8/uint8 Sigmoid and Tanh to a LUT based solution @@ -253,6 +255,7 @@ public: &TFLiteGraphOptimiser::ConvertLogToLUT, &TFLiteGraphOptimiser::ConvertTanhSigmoidToLUT, &TFLiteGraphOptimiser::ConvertSoftmaxOps, + &TFLiteGraphOptimiser::ConvertLstmOps, &TFLiteGraphOptimiser::ReplacePadByExplicitPadding, &TFLiteGraphOptimiser::ConvertMeanOps, &TFLiteGraphOptimiser::ConvertPrelu, diff --git a/ethosu/regor/test/test_passthrough.cpp b/ethosu/regor/test/test_passthrough.cpp index 0698ead6..10c13f6f 100644 --- a/ethosu/regor/test/test_passthrough.cpp +++ b/ethosu/regor/test/test_passthrough.cpp @@ -515,7 +515,17 @@ TEST_CASE("passthrough") } { - // Generate simple output tensor + // Generate intermediate tensor (tensor index 4) + // Intermediates cannot be constant and must use buffer 0 + const std::vector shape = {1, 11, 11, 3}; + const tflite::TensorType type = tflite::TensorType::INT16; + const int bufferIndex = 0; + const std::string name = "intermediate"; + tensors.push_back(tflite::CreateTensorDirect(fbb, &shape, type, bufferIndex, name.c_str())); + } + + { + // Generate simple output tensor (tensor index 5) const std::vector shape = {1, 11, 11, 
3}; const tflite::TensorType type = tflite::TensorType::FLOAT32; const int bufferIndex = 0; @@ -536,10 +546,10 @@ TEST_CASE("passthrough") // Generate 1 operator const uint32_t opcodeIndex = 0; const std::vector inputs = {0, 1, 2, 3}; - const std::vector outputs = {4}; + const std::vector intermediates = {4}; + const std::vector outputs = {5}; const std::vector customOptions = random_vector(5); const std::vector mutatingVariableInputs = random_vector(4); - const std::vector intermediates = random_vector(4); // Generate builtin_options or builtin_options2 flatbuffers::Offset<> builtinOptions = 0; @@ -567,7 +577,7 @@ TEST_CASE("passthrough") { // Generate 1 subgraph const std::vector inputs = {0 /* ifm0 */}; - const std::vector outputs = {4 /* ofm */}; + const std::vector outputs = {5 /* ofm */}; const char *name = "subgraph1"; subgraphs.push_back(tflite::CreateSubGraphDirect(fbb, &tensors, &inputs, &outputs, &operations, name)); } diff --git a/ethosu/regor/tflite/tflite_mapping.cpp b/ethosu/regor/tflite/tflite_mapping.cpp index 27c31c39..f4b4eec0 100644 --- a/ethosu/regor/tflite/tflite_mapping.cpp +++ b/ethosu/regor/tflite/tflite_mapping.cpp @@ -630,8 +630,32 @@ const std::multimap TfLiteMapping::_inputTensorIndices = { {OpType::Transpose, TensorUsage::IFM0}, {OpType::Transpose, TensorUsage::Params}, {OpType::Unique, TensorUsage::IFM0}, + // LSTM {OpType::UnidirectionalSequenceLstm, TensorUsage::IFM0}, {OpType::UnidirectionalSequenceLstm, TensorUsage::Weights}, + {OpType::UnidirectionalSequenceLstm, MakeTensorUsage(TensorUsage::Weights, 1)}, + {OpType::UnidirectionalSequenceLstm, MakeTensorUsage(TensorUsage::Weights, 2)}, + {OpType::UnidirectionalSequenceLstm, MakeTensorUsage(TensorUsage::Weights, 3)}, + {OpType::UnidirectionalSequenceLstm, MakeTensorUsage(TensorUsage::Weights, 4)}, + {OpType::UnidirectionalSequenceLstm, MakeTensorUsage(TensorUsage::Weights, 5)}, + {OpType::UnidirectionalSequenceLstm, MakeTensorUsage(TensorUsage::Weights, 6)}, + {OpType::UnidirectionalSequenceLstm, MakeTensorUsage(TensorUsage::Weights, 7)}, + {OpType::UnidirectionalSequenceLstm, MakeTensorUsage(TensorUsage::Weights, 8)}, + {OpType::UnidirectionalSequenceLstm, MakeTensorUsage(TensorUsage::Weights, 9)}, + {OpType::UnidirectionalSequenceLstm, MakeTensorUsage(TensorUsage::Weights, 10)}, + {OpType::UnidirectionalSequenceLstm, TensorUsage::Scales}, + {OpType::UnidirectionalSequenceLstm, MakeTensorUsage(TensorUsage::Scales, 1)}, + {OpType::UnidirectionalSequenceLstm, MakeTensorUsage(TensorUsage::Scales, 2)}, + {OpType::UnidirectionalSequenceLstm, MakeTensorUsage(TensorUsage::Scales, 3)}, + {OpType::UnidirectionalSequenceLstm, MakeTensorUsage(TensorUsage::Weights, 11)}, + {OpType::UnidirectionalSequenceLstm, MakeTensorUsage(TensorUsage::Scales, 4)}, + {OpType::UnidirectionalSequenceLstm, TensorUsage::State}, + {OpType::UnidirectionalSequenceLstm, MakeTensorUsage(TensorUsage::State, 1)}, + {OpType::UnidirectionalSequenceLstm, MakeTensorUsage(TensorUsage::Scales, 5)}, + {OpType::UnidirectionalSequenceLstm, MakeTensorUsage(TensorUsage::Scales, 6)}, + {OpType::UnidirectionalSequenceLstm, MakeTensorUsage(TensorUsage::Scales, 7)}, + {OpType::UnidirectionalSequenceLstm, MakeTensorUsage(TensorUsage::Scales, 8)}, + // RNN {OpType::UnidirectionalSequenceRnn, TensorUsage::IFM0}, {OpType::UnidirectionalSequenceRnn, TensorUsage::Weights}, {OpType::Unpack, TensorUsage::IFM0}, diff --git a/ethosu/regor/tflite/tflite_model_semantics.cpp b/ethosu/regor/tflite/tflite_model_semantics.cpp index ca33f241..c27b97c7 100644 --- 
a/ethosu/regor/tflite/tflite_model_semantics.cpp +++ b/ethosu/regor/tflite/tflite_model_semantics.cpp @@ -245,9 +245,11 @@ void ConstraintEmptyConstTensors(const Model &m_model) { auto tensor = tensors[BoundsCheckedIndex(input, tensors)]; auto buffer = buffers[BoundsCheckedIndex(tensor->buffer(), buffers)]; - // Buffer 0 is a special buffer that is used for empty tensors - if ( (tensor->buffer() > 0 && (!buffer->data() || buffer->data()->size() == 0) && buffer->offset() <= 1) || - (buffer->offset() > 1 && buffer->size() == 0) ) + // Buffer 0 is a special buffer that is used for empty tensors. + // Variable tensors are also empty but are not forced to use Buffer 0. + if ( !tensor->is_variable() && + ((tensor->buffer() > 0 && (!buffer->data() || buffer->data()->size() == 0) && buffer->offset() <= 1) || + (buffer->offset() > 1 && buffer->size() == 0)) ) { std::string constraint = "Constant tensors must not have empty buffers"; std::string extra = "Found Constant Tensor with empty buffer"; diff --git a/ethosu/regor/tflite/tflite_reader.cpp b/ethosu/regor/tflite/tflite_reader.cpp index 9710d50a..d0e17bcf 100644 --- a/ethosu/regor/tflite/tflite_reader.cpp +++ b/ethosu/regor/tflite/tflite_reader.cpp @@ -90,6 +90,27 @@ static void SetKernel(const std::shared_ptr &operation, const Point2i operation->SetKernel(std::move(kernel)); } +static void ReshapeFullyConnectedWeights(const std::shared_ptr &operation, TensorUsage weightUsage) +{ + auto weight_tensor = operation->Input(weightUsage)->tensor; + if ( weight_tensor->AxisOrder() == AxisOrder::Unknown ) + { + const auto &shape = weight_tensor->StorageShape(); + // Reshape weight tensor from (num_outputs, ..., num_inputs) to (num_outputs, 1, 1, num_inputs) + if ( shape.Size() >= 2 && shape.Elements() == (shape[0] * shape[-1]) ) + { + weight_tensor->Reshape(Shape(shape[0], 1, 1, shape[-1])); + weight_tensor->SetAxisOrder(AxisOrder::OHWI); + operation->Input(weightUsage)->shape = weight_tensor->StorageShape(); + } + } + else + { + // Weight tensor has already been reshaped + assert(weight_tensor->AxisOrder() == AxisOrder::OHWI); + } +} + const tflite::Model *TfLiteReader::LoadModel(const void *input, size_t size) { const uint8_t *buffer = static_cast(input); @@ -192,6 +213,7 @@ void TfLiteReader::LoadGraphs(const uint8_t *input, const tflite::Model *model, assert(tflite_inputs); auto tflite_outputs = tflite_operator->outputs(); assert(tflite_outputs); + auto tflite_intermediates = tflite_operator->intermediates(); const auto &input_tensors = *tflite_inputs; // A vector of indices into the `tensors` vector int indirect_index = 0; // An index into `input_tensors` int ifm_count = 0; @@ -242,6 +264,19 @@ void TfLiteReader::LoadGraphs(const uint8_t *input, const tflite::Model *model, placeholder.push_back(std::move(tensor)); } + if ( tflite_intermediates ) + { + // Connect operation to its intermediate tensors. They are added as inputs with usage Intermediate. 
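+            // The connections below use the Scratch usage type; for UnidirectionalSequenceLstm the
+            // fifth intermediate (index 4) carries the effective hidden scale, whose quantization
+            // the LSTM lowering reads when scaling the output-state Mul.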
+ int intermediate_count = 0; + for ( const int tensor_index : *tflite_intermediates ) + { + const auto &intermediate = tensors.at(tensor_index); + assert(tensorQuantization.count(intermediate->Uid()) > 0); + operation->ConnectInput(MakeTensorUsage(TensorUsage::Scratch, intermediate_count++), intermediate) + .Set(tensorQuantization[intermediate->Uid()]); + } + } + // Connect operation to its output tensors int ofm_count = 0; for ( const int tensor_index : *tflite_outputs ) @@ -430,6 +465,14 @@ std::shared_ptr TfLiteReader::ParseTensor(const tflite::Tensor *tflite_t LOG_WARN("Tensor '{}' contains sparsity information, which is not supported and will be ignored.\n", name); } + if ( tflite_tensor->is_variable() ) + { + // Create an empty buffer for variable tensor + assert(buffer == nullptr && "Unexpected buffer for variable tensor!"); + auto emptyBuffer = std::make_shared(std::vector{}); + tensor->SetBuffer(emptyBuffer); + } + tensor->SetPassthrough(tflite_tensor); return tensor; @@ -510,23 +553,8 @@ void TfLiteReader::ParseOperatorOptions( const auto options = GetBuiltinOptions(tflite_operator); activation_function = options->fused_activation_function(); // TODO: Are `weights_format`, `keep_num_dims` or `asymmetric_quantize_inputs` used? + ReshapeFullyConnectedWeights(operation, TensorUsage::Weights); auto weight_tensor = operation->Input(TensorUsage::Weights)->tensor; - if ( weight_tensor->AxisOrder() == AxisOrder::Unknown ) - { - const auto &shape = weight_tensor->StorageShape(); - // Reshape weight tensor from (num_outputs, ..., num_inputs) to (num_outputs, 1, 1, num_inputs) - if ( shape.Size() >= 2 && shape.Elements() == (shape[0] * shape[-1]) ) - { - weight_tensor->Reshape(Shape(shape[0], 1, 1, shape[-1])); - weight_tensor->SetAxisOrder(AxisOrder::OHWI); - operation->Input(TensorUsage::Weights)->shape = weight_tensor->StorageShape(); - } - } - else - { - // Weight tensor has already been reshaped - assert(weight_tensor->AxisOrder() == AxisOrder::OHWI); - } if ( operation->Input(TensorUsage::Scales) == nullptr ) { // Op has no bias; add bias tensor filled with zeros @@ -698,6 +726,23 @@ void TfLiteReader::ParseOperatorOptions( } break; + case tflite::BuiltinOptions::UnidirectionalSequenceLSTMOptions: + { + const auto options = GetBuiltinOptions(tflite_operator); + operation->Attribute()->cell_clip = options->cell_clip(); + operation->Attribute()->projection_clip = options->proj_clip(); + operation->Attribute()->time_major = options->time_major(); + + for ( int i = 0; i < 12; i++ ) + { + if ( operation->Input(MakeTensorUsage(TensorUsage::Weights, i)) ) + { + ReshapeFullyConnectedWeights(operation, MakeTensorUsage(TensorUsage::Weights, i)); + } + } + } + break; + case tflite::BuiltinOptions::ResizeBilinearOptions: case tflite::BuiltinOptions::ResizeNearestNeighborOptions: break; @@ -768,7 +813,6 @@ void TfLiteReader::ParseOperatorOptions( case tflite::BuiltinOptions::FillOptions: case tflite::BuiltinOptions::BidirectionalSequenceLSTMOptions: case tflite::BuiltinOptions::BidirectionalSequenceRNNOptions: - case tflite::BuiltinOptions::UnidirectionalSequenceLSTMOptions: case tflite::BuiltinOptions::FloorModOptions: case tflite::BuiltinOptions::RangeOptions: case tflite::BuiltinOptions::SquaredDifferenceOptions: diff --git a/ethosu/regor/tflite/tflite_supported_operators.cpp b/ethosu/regor/tflite/tflite_supported_operators.cpp index e0218087..f30933c1 100644 --- a/ethosu/regor/tflite/tflite_supported_operators.cpp +++ b/ethosu/regor/tflite/tflite_supported_operators.cpp @@ -950,6 +950,53 @@ 
bool TfLiteSupportedOperators::ConstraintLog(const Operation *op) return true; } +bool TfLiteSupportedOperators::ConstraintLSTM(const Operation *op) +{ + OpType opType = op->Type(); + if ( opType != OpType::UnidirectionalSequenceLstm ) + { + return true; + } + + for ( int i = 0; i <= 7; i++ ) + { + // Check that all the gate weights are present. If they are not it's either invalid or using Couple + // Input and Forget Gate (CIFG), where the input gate is computed implicitly from the forget gate, + // which is not supported. + if ( op->Input(MakeTensorUsage(TensorUsage::Weights, i)) == nullptr ) + { + Failure(op, "Missing gate weight tensor", "LSTM with implicit gate calculation is not supported"); + return false; + } + } + + for ( int i = 8; i <= 10; i++ ) + { + if ( op->Input(MakeTensorUsage(TensorUsage::Weights, i)) ) + { + Failure(op, "Peephole weight tensor present", "Peephole LSTM variant is not supported"); + return false; + } + } + + if ( op->Input(MakeTensorUsage(TensorUsage::Weights, 11)) || op->Input(MakeTensorUsage(TensorUsage::Scales, 4)) ) + { + Failure(op, "Projection weight or bias tensor present", "LSTM with projection is not supported"); + return false; + } + + for ( int i = 5; i <= 8; i++ ) + { + if ( op->Input(MakeTensorUsage(TensorUsage::Scales, i)) ) + { + Failure(op, "Normalization coefficient tensor present", "LSTM with gate normalization is not supported"); + return false; + } + } + + return true; +} + void TfLiteSupportedOperators::Failure(const Operation *op, const std::string &message, const std::string &constraint) { assert(op); @@ -1011,6 +1058,7 @@ TfLiteSupportedOperators::TfLiteSupportedOperators(IArchitectureConstraints *con &TfLiteSupportedOperators::ConstraintTransposeDims, &TfLiteSupportedOperators::ConstraintStridedSlice, &TfLiteSupportedOperators::ConstraintLog, + &TfLiteSupportedOperators::ConstraintLSTM, }; } diff --git a/ethosu/regor/tflite/tflite_supported_operators.hpp b/ethosu/regor/tflite/tflite_supported_operators.hpp index 0b7126e7..811cb6c2 100644 --- a/ethosu/regor/tflite/tflite_supported_operators.hpp +++ b/ethosu/regor/tflite/tflite_supported_operators.hpp @@ -76,6 +76,7 @@ private: bool ConstraintTransposeDims(const Operation *op); bool ConstraintStridedSlice(const Operation *op); bool ConstraintLog(const Operation *op); + bool ConstraintLSTM(const Operation *op); }; // Factory for supported-ops checkers diff --git a/ethosu/regor/tflite/tflite_supported_operators_u55.cpp b/ethosu/regor/tflite/tflite_supported_operators_u55.cpp index f49ccdb8..5bad24e3 100644 --- a/ethosu/regor/tflite/tflite_supported_operators_u55.cpp +++ b/ethosu/regor/tflite/tflite_supported_operators_u55.cpp @@ -80,6 +80,7 @@ TfLiteSupportedOperatorsU55::TfLiteSupportedOperatorsU55(IArchitectureConstraint OpType::HardSwish, OpType::MemoryCopy, OpType::Log, + OpType::UnidirectionalSequenceLstm, // clang-format on }; _supportedDataTypes = { diff --git a/ethosu/regor/tflite/tflite_supported_operators_u85.cpp b/ethosu/regor/tflite/tflite_supported_operators_u85.cpp index 9b9fc635..db231e77 100644 --- a/ethosu/regor/tflite/tflite_supported_operators_u85.cpp +++ b/ethosu/regor/tflite/tflite_supported_operators_u85.cpp @@ -101,6 +101,7 @@ TfLiteSupportedOperatorsU85::TfLiteSupportedOperatorsU85(IArchitectureConstraint OpType::ReduceAll, OpType::MemoryCopy, OpType::Log, + OpType::UnidirectionalSequenceLstm, // clang-format on }; _supportedDataTypes = { diff --git a/ethosu/regor/tflite/tflite_writer.cpp b/ethosu/regor/tflite/tflite_writer.cpp index 584635ee..19eebc06 100644 --- 
a/ethosu/regor/tflite/tflite_writer.cpp +++ b/ethosu/regor/tflite/tflite_writer.cpp @@ -169,12 +169,20 @@ std::unique_ptr TfLiteWriter::SerialiseImpl(const std::vector inputs, outputs; + std::vector inputs, outputs, intermediates; for ( const auto &tensor : SortedInputTensors(operation, type) ) { // Skip placeholder tensors if ( graph->IsPlaceholder(tensor) ) continue; - inputs.push_back(SerialisedTensorIndex(tensor, tensor_address_map, *graph)); + if ( (operation->UsageOfTensor(tensor) & TensorUsage::TypeMask) == TensorUsage::Scratch ) + { + // Scratch usage means this is an intermediate tensor + intermediates.push_back(SerialisedTensorIndex(tensor, tensor_address_map, *graph)); + } + else + { + inputs.push_back(SerialisedTensorIndex(tensor, tensor_address_map, *graph)); + } } for ( const auto &connection : operation->Outputs() ) { @@ -188,7 +196,6 @@ std::unique_ptr TfLiteWriter::SerialiseImpl(const std::vector> custom_options = 0; flatbuffers::Offset> mvi = 0; // mutating_variable_inputs - flatbuffers::Offset> intermediates = 0; uint64_t large_custom_options_offset = 0; uint64_t large_custom_options_size = 0; @@ -212,16 +219,16 @@ std::unique_ptr TfLiteWriter::SerialiseImpl(const std::vectorcustom_options_format(); custom_options = FlatbufferUtils::CopyVector(_flatbuffer, tflite_operator->custom_options()); mvi = FlatbufferUtils::CopyVector(_flatbuffer, tflite_operator->mutating_variable_inputs()); - intermediates = FlatbufferUtils::CopyVector(_flatbuffer, tflite_operator->intermediates()); } auto serialised_inputs = _flatbuffer.CreateVector(inputs); + auto serialised_intermediates = _flatbuffer.CreateVector(intermediates); auto serialised_outputs = _flatbuffer.CreateVector(outputs); auto serialised_options = SerialiseOptions(operation, type); auto serialised_options2 = SerialiseOptions2(operation, type); _serialised_operations.push_back(tflite::CreateOperator(_flatbuffer, opcode_index, serialised_inputs, serialised_outputs, - builtin_options_type, serialised_options, custom_options, custom_options_format, mvi, intermediates, + builtin_options_type, serialised_options, custom_options, custom_options_format, mvi, serialised_intermediates, large_custom_options_offset, large_custom_options_size, builtin_options_2_type, serialised_options2)); } -- GitLab
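For reference, below is a minimal floating-point sketch of the per-time-step computation that the lowering in compiler/lstm.cpp expresses as a graph of FullyConnected, Add, Sigmoid/Tanh, Mul, Clamp and MemoryCopy operations. It covers the supported variant only (no CIFG, peephole, projection or gate normalisation, matching ConstraintLSTM above); the helper names and the plain-float arithmetic are illustrative assumptions, whereas the generated graph operates on quantized tensors using the Q0.15/Q3.12 scalings set up in CalculateGate, CalculateCellState and CalculateOutputState.

#include <cmath>
#include <cstddef>
#include <vector>

// Weights for one LSTM cell, row-major: W* are [nCell x nFeature], R* are [nCell x nCell],
// b* are [nCell]. These correspond to TFLite inputs 1-8 (gate weights) and 12-15 (gate biases).
struct LstmWeights
{
    std::vector<float> Wi, Wf, Wc, Wo;  // input-to-{input,forget,cell,output} weights
    std::vector<float> Ri, Rf, Rc, Ro;  // recurrent-to-{input,forget,cell,output} weights
    std::vector<float> bi, bf, bc, bo;  // gate biases
};

static float Dot(const float *w, const float *v, std::size_t n)
{
    float acc = 0.0f;
    for ( std::size_t i = 0; i < n; i++ ) acc += w[i] * v[i];
    return acc;
}

// gate = Activation( FC(x, W) + FC(h, R) + b ), mirroring LSTM::CalculateGate where the two
// FullyConnected results are combined with an elementwise Add before Sigmoid or Tanh.
static float Gate(const std::vector<float> &x, const std::vector<float> &h, const std::vector<float> &W,
    const std::vector<float> &R, const std::vector<float> &b, std::size_t row, bool useTanh)
{
    float pre = Dot(&W[row * x.size()], x.data(), x.size()) + Dot(&R[row * h.size()], h.data(), h.size()) + b[row];
    return useTanh ? std::tanh(pre) : 1.0f / (1.0f + std::exp(-pre));
}

// One iteration of LSTM::Step for a single batch entry: h is the output state (TFLite input 18)
// and c the cell state (TFLite input 19); both persist between time steps, and h is also what
// SetOutputWrite copies into the current time/batch slice of the OFM.
void LstmStep(const std::vector<float> &x, std::vector<float> &h, std::vector<float> &c, const LstmWeights &w, float cellClip)
{
    std::vector<float> hNew(h.size()), cNew(c.size());
    for ( std::size_t j = 0; j < h.size(); j++ )
    {
        float i = Gate(x, h, w.Wi, w.Ri, w.bi, j, false);  // input gate   (Sigmoid)
        float f = Gate(x, h, w.Wf, w.Rf, w.bf, j, false);  // forget gate  (Sigmoid)
        float g = Gate(x, h, w.Wc, w.Rc, w.bc, j, true);   // cell gate    (Tanh)
        float o = Gate(x, h, w.Wo, w.Ro, w.bo, j, false);  // output gate  (Sigmoid)

        float cj = f * c[j] + i * g;                       // CalculateCellState
        if ( cellClip != 0.0f ) cj = std::fmin(std::fmax(cj, -cellClip), cellClip);
        cNew[j] = cj;
        hNew[j] = o * std::tanh(cj);                       // CalculateOutputState
    }
    c = cNew;
    h = hNew;
}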