From 92013dad07e0b861e4388fde7bfa216e5e347839 Mon Sep 17 00:00:00 2001 From: Jacob Bohlin Date: Fri, 25 Apr 2025 08:44:07 +0100 Subject: [PATCH] MLBEDSW-8926 Port LSTM to Regor * Ported lowering of TFLite::UnidirectionalSequenceLstm to Regor. * Added reading of TFLite intermediate tensors. Added a new TensorUsage::Intermediate for these tensors. * Added logic to allocate tensors which point to the same buffer to the same address, enabling this to be controlled in GraphIR. * Added an optional Tag to the Buffer hash function in order to differentiate between multiple empty buffers which stem from different TFLite variable tensors. * Added missing rescaling for Sigmoid and Tanh when fused with Elementwise Add, Sub or Mul. * Added some limitations to persistent tensors: - They are now required to be in linear format. - They can not share memory with non-persistent tensors. * Made a small modification to graph traversal so that partial writes are processed in the order they are added to the graph. * Added supported operator checks for UnidirectionalSequenceLstm. Change-Id: I6bd08822a41dca48b3aa8091b07747327b37d68f Signed-off-by: Jacob Bohlin --- ethosu/regor/CMakeLists.txt | 1 + .../ethosu55/ethos_u55_scaling.cpp | 11 +- .../ethosu85/ethos_u85_scaling.cpp | 12 +- ethosu/regor/common/buffer_view.hpp | 30 +- ethosu/regor/compiler/attributes.cpp | 1 + ethosu/regor/compiler/attributes.hpp | 12 + ethosu/regor/compiler/graph.hpp | 20 +- ethosu/regor/compiler/lstm.cpp | 401 ++++++++++++++++++ ethosu/regor/compiler/lstm.hpp | 67 +++ ethosu/regor/compiler/operation.hpp | 2 + ethosu/regor/compiler/operation_util.hpp | 27 ++ ethosu/regor/compiler/quantization.hpp | 7 + ethosu/regor/compiler/scheduler.cpp | 4 +- ethosu/regor/compiler/scheduler_packing.cpp | 27 ++ ethosu/regor/compiler/scheduler_packing.hpp | 1 + .../regor/compiler/tflite_graph_optimiser.cpp | 11 + .../regor/compiler/tflite_graph_optimiser.hpp | 3 + ethosu/regor/test/test_passthrough.cpp | 18 +- ethosu/regor/tflite/tflite_mapping.cpp | 24 ++ .../regor/tflite/tflite_model_semantics.cpp | 8 +- ethosu/regor/tflite/tflite_reader.cpp | 78 +++- .../tflite/tflite_supported_operators.cpp | 48 +++ .../tflite/tflite_supported_operators.hpp | 1 + .../tflite/tflite_supported_operators_u55.cpp | 1 + .../tflite/tflite_supported_operators_u85.cpp | 1 + ethosu/regor/tflite/tflite_writer.cpp | 17 +- 26 files changed, 786 insertions(+), 47 deletions(-) create mode 100644 ethosu/regor/compiler/lstm.cpp create mode 100644 ethosu/regor/compiler/lstm.hpp diff --git a/ethosu/regor/CMakeLists.txt b/ethosu/regor/CMakeLists.txt index 80aa074f..f315b48a 100644 --- a/ethosu/regor/CMakeLists.txt +++ b/ethosu/regor/CMakeLists.txt @@ -288,6 +288,7 @@ regor_lib( "compiler/scheduler_packing.cpp" "compiler/scheduler_operation.cpp" "compiler/softmax.cpp" + "compiler/lstm.cpp" "compiler/tensor.cpp" "compiler/tensor_allocator.cpp" "compiler/tflite_graph_optimiser.cpp" diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_scaling.cpp b/ethosu/regor/architecture/ethosu55/ethos_u55_scaling.cpp index ce88b528..4af22546 100644 --- a/ethosu/regor/architecture/ethosu55/ethos_u55_scaling.cpp +++ b/ethosu/regor/architecture/ethosu55/ethos_u55_scaling.cpp @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2021-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -75,12 +75,20 @@ void RescaleElementwise(HLCOperation *op) DataType ifmDataType = op->ifm[0].dataType; 
OpType opType = op->type; + double effectiveScale = 0; + if ( !op->subOps.empty() && (op->subOps[0].type == OpType::Sigmoid || op->subOps[0].type == OpType::Tanh) ) + { + // Adjust for Sigmoid/Tanh effective output scale. + effectiveScale = 1.0 / 0x3000; + } + bool allHaveScale = (!ifm1Quant->scales.empty() && !ofmQuant->scales.empty() && ifm2Quant != nullptr && !ifm2Quant->scales.empty()); if ( opType == OpType::Mul ) { if ( allHaveScale ) { + ofmScale = effectiveScale ? effectiveScale : ofmScale; outScale = ElementwiseMulScale(ifm1Scale, ifm2Scale, ofmScale); } } @@ -95,6 +103,7 @@ void RescaleElementwise(HLCOperation *op) } else if ( opType == OpType::Add || opType == OpType::Sub ) { + ofmScale = effectiveScale ? effectiveScale : ofmScale; int bitDepth = DataTypeSizeBits(ifmDataType); bool useAdvancedScaling = false; uint32_t opaScale = 1; diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85_scaling.cpp b/ethosu/regor/architecture/ethosu85/ethos_u85_scaling.cpp index fafa2d57..b7171177 100644 --- a/ethosu/regor/architecture/ethosu85/ethos_u85_scaling.cpp +++ b/ethosu/regor/architecture/ethosu85/ethos_u85_scaling.cpp @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2021-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -132,6 +132,14 @@ void RescaleElementwise(HLCOperation *op) DataType ifmDataType = op->ifm[0].dataType; OpType opType = op->type; + + double effectiveScale = 0; + if ( !op->subOps.empty() && (op->subOps[0].type == OpType::Sigmoid || op->subOps[0].type == OpType::Tanh) ) + { + // Adjust for Sigmoid/Tanh effective output scale. + effectiveScale = 1.0 / 0x3000; + } + bool allHaveScale = (!ifm1Quant->scales.empty() && !ofmQuant->scales.empty() && ifm2Quant != nullptr && !ifm2Quant->scales.empty()); @@ -144,6 +152,7 @@ void RescaleElementwise(HLCOperation *op) { if ( allHaveScale ) { + ofmScale = effectiveScale ? effectiveScale : ofmScale; outScale = ElementwiseMulScale(ifm1Scale, ifm2Scale, ofmScale); } } @@ -159,6 +168,7 @@ void RescaleElementwise(HLCOperation *op) { if ( allHaveScale ) { + ofmScale = effectiveScale ? effectiveScale : ofmScale; AdvancedElementwiseAddSubScale(ifm1Scale, ifm2Scale, ofmScale, bitDepth, input1Scale, input2Scale, outScale); } } diff --git a/ethosu/regor/common/buffer_view.hpp b/ethosu/regor/common/buffer_view.hpp index 494172fc..0dc848e6 100644 --- a/ethosu/regor/common/buffer_view.hpp +++ b/ethosu/regor/common/buffer_view.hpp @@ -331,15 +331,27 @@ public: void Rehash() { - // Calculate MD5 hash of data, prefixed by the size of data - const auto buffer = const_cast(this); - std::string sizeStr("<"); - sizeStr += std::to_string(buffer->Size()); - sizeStr += '>'; - MD5 hash; - hash.Combine(reinterpret_cast(sizeStr.data()), int(sizeStr.size())); - hash.Combine(buffer->Data(), buffer->Size()); - hash.Get(_dataHash); + if ( Size() > 0 ) + { + // Calculate MD5 hash of data, prefixed by the size of data + std::string sizeStr("<"); + sizeStr += std::to_string(Size()); + sizeStr += '>'; + MD5 hash; + // Make sure the const overload of Data() is called + const uint8_t *data = std::as_const(*this).Data(); + hash.Combine(reinterpret_cast(sizeStr.data()), int(sizeStr.size())); + hash.Combine(data, Size()); + hash.Get(_dataHash); + } + else + { + // If the buffer is empty use the pointer to this buffer object as a hash to + // disambiguate between different empty buffers. 
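+            // Every TFLite variable tensor is given its own empty Buffer by the reader, and the
+            // scheduler keys its shared-allocation (equivalenceId) map on Buffer::Hash(), so a pure
+            // size+data hash would make all empty buffers collide and merge unrelated state tensors
+            // into a single allocation.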
+ uintptr_t ptr = reinterpret_cast(this); + _dataHash.v32[0] = _dataHash.v32[1] = static_cast(ptr); + _dataHash.v32[2] = _dataHash.v32[3] = static_cast(ptr >> 32); + } } private: diff --git a/ethosu/regor/compiler/attributes.cpp b/ethosu/regor/compiler/attributes.cpp index 659745ab..367bc1e5 100644 --- a/ethosu/regor/compiler/attributes.cpp +++ b/ethosu/regor/compiler/attributes.cpp @@ -55,6 +55,7 @@ DynamicRef CreateAttribute(uint32_t reducedHash) CASE_MAKE_ATTR_INSTANCE(transpose_conv2d_attr_t); CASE_MAKE_ATTR_INSTANCE(while_attr_t); CASE_MAKE_ATTR_INSTANCE(mirror_pad_mode_attr_t); + CASE_MAKE_ATTR_INSTANCE(unidirectional_sequence_lstm_attr_t); default: assert(false && "No attribute has this reduced hash"); // Add a new XXX_attr_t struct to the header then diff --git a/ethosu/regor/compiler/attributes.hpp b/ethosu/regor/compiler/attributes.hpp index 4ce8260e..8f55caeb 100644 --- a/ethosu/regor/compiler/attributes.hpp +++ b/ethosu/regor/compiler/attributes.hpp @@ -280,6 +280,18 @@ struct mirror_pad_mode_attr_t END_FIELD_TABLE() }; +struct unidirectional_sequence_lstm_attr_t +{ + int cell_clip; + int projection_clip; + bool time_major; + BEGIN_FIELD_TABLE(unidirectional_sequence_lstm_attr_t) + ATTR_FIELD(cell_clip, 0) + ATTR_FIELD(projection_clip, 1) + ATTR_FIELD(time_major, 2) + END_FIELD_TABLE() +}; + #define REDUCED_HASH(hash) (hash & 0x000FFFFF) DynamicRef CreateAttribute(uint32_t hash); diff --git a/ethosu/regor/compiler/graph.hpp b/ethosu/regor/compiler/graph.hpp index c4c7bbd3..90b3d72b 100644 --- a/ethosu/regor/compiler/graph.hpp +++ b/ethosu/regor/compiler/graph.hpp @@ -151,6 +151,14 @@ public: void SetScheduledOrder(std::vector operations) { _opsInScheduledOrder = std::move(operations); } + // Traverse the graph in right-to-left reverse post-order but processing tensor writers left-to-right. + // This means in below graph, where A and B both write to the input tensor of C, A will be processed + // before B. + // A B + // \ / + // | + // C + // The rationale is to preserve the order that partial writes are added to the graph. template static void TraverseGraphFromEnd(const std::vector> &from, OPFUNC opFunc) { @@ -166,9 +174,10 @@ public: for ( const auto &tensor : from ) { - for ( const auto &op : tensor->Writers() ) + const auto &writers = tensor->Writers(); + for ( auto it = writers.crbegin(); it != writers.crend(); it++ ) { - stack.emplace(false, op); + stack.emplace(false, *it); } } @@ -189,11 +198,12 @@ public: stack.emplace(true, entry.op); for ( const auto &pair : entry.op->Inputs().pairs() ) { - for ( const auto &op : pair.second.tensor->Writers() ) + const auto &writers = pair.second.tensor->Writers(); + for ( auto it = writers.crbegin(); it != writers.crend(); it++ ) { - if ( visited.count(op.get()) == 0 ) + if ( visited.count(it->get()) == 0 ) { - stack.emplace(false, op); + stack.emplace(false, *it); } } } diff --git a/ethosu/regor/compiler/lstm.cpp b/ethosu/regor/compiler/lstm.cpp new file mode 100644 index 00000000..f112770e --- /dev/null +++ b/ethosu/regor/compiler/lstm.cpp @@ -0,0 +1,401 @@ +// +// SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "lstm.hpp" + +#include "operation_util.hpp" +#include "quantization.hpp" + +namespace regor +{ + +static constexpr double Q0_15_SCALE = 1.0 / (1 << 15); +static constexpr double Q3_12_SCALE = 1.0 / (1 << 12); + +LSTM::LSTM(Operation *operation, OptimiserDatabase *db, Graph *graph) : _lstmOp(operation), _db(db), _graph(graph) +{ + assert(_lstmOp->Type() == OpType::UnidirectionalSequenceLstm); + + // Attributes + assert(operation->HasAttribute()); + auto *attr = operation->Attribute(); + _isTimeMajor = attr->time_major; + _cellClip = attr->cell_clip; + + // Input/Output + _ifmConn = _lstmOp->Input(TensorUsage::IFM); + _ofmConn = _lstmOp->Output(TensorUsage::OFM); + + // Input dimensions + Shape ifmShape = _ifmConn->shape; + _nFeature = ifmShape[-1]; + _nTime = ifmShape[_isTimeMajor ? 0 : 1]; + _nBatch = ifmShape[_isTimeMajor ? 1 : 0]; +} + +void LSTM::RecordOptimisation(Operation *op) +{ + if ( _db ) + { + _db->AddOptimised(_lstmOp, op); + } +} + +Operation *LSTM::ConvertOp() +{ + Operation *returnOp = _lstmOp; + int numBatches = _isTimeMajor ? 1 : _nBatch; + for ( int batch = 0; batch < numBatches; batch++ ) + { + TensorConnection *outputState = GetInitialState(TensorUsage::State, batch); + TensorConnection *cellState = GetInitialState(MakeTensorUsage(TensorUsage::State, 1), batch); + for ( int time = 0; time < _nTime; time++ ) + { + TensorConnection *feature = ExtractFeatureSlice(time, batch); + assert(feature); + std::tie(outputState, cellState) = Step(feature, outputState, cellState, time, batch); + returnOp = SetOutputWrite(outputState, time, batch); + } + } + + if ( returnOp != _lstmOp ) + { + _lstmOp->Disconnect(); + } + return returnOp; +} + +// Extract time and batch slice of the input tensor. +TensorConnection *LSTM::ExtractFeatureSlice(int time, int batch) +{ + std::shared_ptr featureTensor = _ifmConn->tensor->Clone(); + featureTensor->SetName(fmt::format("{0}_feauture_b{1}.t{2}", featureTensor->Name(), batch, time)); + featureTensor->SetStorageShape({(_isTimeMajor ? _nBatch : 1), _nFeature}); + auto op = std::make_shared(OpType::Slice); + + auto readShape = featureTensor->StorageShape(); + auto readOffset = _isTimeMajor ? Shape(time, 0, 0) : Shape(batch, time, 0); + auto *attr = op->Attribute(); + attr->size = readShape; + attr->begin = readOffset; + + op->CopyInput(TensorUsage::IFM, *_ifmConn); + op->ConnectOutput(TensorUsage::OFM, featureTensor).Set(_ifmConn->quantization); + RecordOptimisation(op.get()); + return op->Output(TensorUsage::OFM); +} + +// Get state tensor for provided state type and batch +TensorConnection *LSTM::GetInitialState(TensorUsage stateUsage, int batch) +{ + TensorConnection *stateConn = _lstmOp->Input(stateUsage); + if ( _isTimeMajor ) + { + // For time major, return the state tensor directly since all + // batches are calculated in the same step. + return stateConn; + } + else + { + // For batch major, return one batch slice of the state tensor. 
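+        // In batch-major mode each batch is unrolled as its own sequence of steps, so every step
+        // must read and write only its own row of the state; the MemoryCopy below provides that
+        // per-batch view through a read/write slice at the current batch offset.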
+ // The tensor has to be cloned in order to resolve graph dependencies correctly but + // the clone will share the underlying buffer with the original state tensor which + // ensure they are allocated to the same address. + std::shared_ptr newStateTensor = stateConn->tensor->Clone(); + + // Set read/write shape to be one batch and read/write offset to the current batch. + TensorSlice slice({0, 0, batch, 0}, {1, 1, 1, stateConn->shape[-1]}); + const auto &stateQuant = stateConn->quantization; + + auto op = std::make_shared(OpType::MemoryCopy); + op->ConnectInput(TensorUsage::IFM, stateConn->tensor).Set(slice).Set(stateQuant); + op->ConnectOutput(TensorUsage::OFM, newStateTensor).Set(slice).Set(stateQuant); + + // Mark the cloned tensor as persistent to require linear format and avoid fusing with + // other tensors. + _graph->AddPersistent(newStateTensor); + RecordOptimisation(op.get()); + return op->Output(TensorUsage::OFM); + } +} + +// Setup the correct read shape and offset for reading from a state tensor. +void LSTM::SetStateRead(Operation *op, int batch) +{ + if ( !_isTimeMajor && _nBatch > 1 ) + { + Shape cellStateShape = _lstmOp->Input(MakeTensorUsage(TensorUsage::State, 1))->shape; + const Shape ifmShape = op->Input(TensorUsage::IFM)->shape; + op->Input(TensorUsage::IFM)->Set(cellStateShape).Set({{0, 0, batch, 0}, {1, 1, 1, ifmShape[-1]}}); + } +} + +// Write the state for the provided batch by pointing the operations ofm to the state tensor. +void LSTM::SetStateWrite(Operation *op, TensorUsage stateUsage, int batch) +{ + TensorConnection *stateConn = _lstmOp->Input(stateUsage); + + auto ofmConn = op->Output(TensorUsage::OFM); + auto ofmShape = Shape::PadAxes(ofmConn->shape, 4, 1); + + std::shared_ptr newStateTensor = stateConn->tensor->Clone(); + op->ConnectOutput(TensorUsage::OFM, newStateTensor).Set(stateConn->shape).Set(stateConn->quantization); + + if ( !_isTimeMajor && _nBatch > 1 ) + { + auto writeOffset = Shape(0, 0, batch, 0); + ofmConn->Set({writeOffset, ofmShape}); + } + + // Mark the cloned tensor as persistent to require linear format and avoid fusing with + // other tensors. + _graph->AddPersistent(newStateTensor); +} + +// Copy the output state to the time/batch slice of the final output. +Operation *LSTM::SetOutputWrite(TensorConnection *stateConn, int time, int batch) +{ + auto concatOp = std::make_shared(OpType::MemoryCopy); + + auto concatIfmConn = &concatOp->ConnectInput(TensorUsage::IFM, stateConn->tensor).Set(stateConn->shape).Set(_ofmConn->quantization); + + if ( !_isTimeMajor && _nBatch > 1 ) + { + Shape readOffset(0, 0, batch, 0); + Shape readSize(1, 1, 1, stateConn->shape[-1]); + concatIfmConn->Set({readOffset, readSize}); + } + + Shape writeOffset = _isTimeMajor ? Shape(0, time, 0, 0) : Shape(0, batch, time, 0); + Shape writeShape = _isTimeMajor ? 
Shape(1, 1, stateConn->shape[-2], stateConn->shape[-1]) : Shape(1, 1, 1, stateConn->shape[-1]); + concatOp->ConnectOutput(TensorUsage::OFM, _ofmConn->tensor) + .Set(_ofmConn->shape) + .Set(_ofmConn->quantization) + .Set({writeOffset, writeShape}) + .Set(RoundMode::NATURAL); + + RecordOptimisation(concatOp.get()); + return concatOp.get(); +} + +// Generate a gate for the provided input and weights +// Activation( Add( FullyConnected(input feature), FullyConnected(output state) ) ) +TensorConnection *LSTM::CalculateGate(const std::string &name, TensorConnection *featureConn, TensorConnection *stateConn, + TensorConnection *inputWeightConn, TensorConnection *inputBiasConn, TensorConnection *recurrentWeightConn, OpType activationType, int batch) +{ + // Setup fullyconnected output quantization + Quantization fcQuant; + fcQuant.type = QuantizationType::TFLITE; + fcQuant.scales = {Q3_12_SCALE}; + fcQuant.zeroPoints = {0}; + + Operation *inputFC = CreateFullyConnected(fmt::format("{0}_feature_fc", name), featureConn->tensor, + inputWeightConn->tensor, featureConn->quantization, inputWeightConn->quantization, fcQuant, + featureConn->SliceShape(), DataType::Int16, inputBiasConn->tensor, inputBiasConn->quantization); + TensorConnection *inputFCOfmConn = inputFC->Output(TensorUsage::OFM); + + Operation *recurrentFC = CreateFullyConnected(fmt::format("{0}_recurrent_fc", name), stateConn->tensor, + recurrentWeightConn->tensor, stateConn->quantization, recurrentWeightConn->quantization, fcQuant, + stateConn->SliceShape(), DataType::Int16); + SetStateRead(recurrentFC, batch); + TensorConnection *recurrentFCOfmConn = recurrentFC->Output(TensorUsage::OFM); + + Quantization addQuant; + addQuant.type = QuantizationType::TFLITE; + addQuant.scales = {1.0f}; + addQuant.zeroPoints = {0}; + Operation *add = CreateAdd(inputFCOfmConn->tensor, recurrentFCOfmConn->tensor, inputFCOfmConn->quantization, + recurrentFCOfmConn->quantization, addQuant); + + // Create activation function + Quantization activationQuant; + activationQuant.type = QuantizationType::TFLITE; + activationQuant.scales = {1.0f}; + activationQuant.zeroPoints = {0}; + + auto activation = std::make_shared(activationType); + auto addOfmTensor = add->Output(TensorUsage::OFM)->tensor; + + activation->ConnectInput(TensorUsage::IFM, addOfmTensor).Set(addQuant); + activation->ConnectOutput(TensorUsage::OFM, addOfmTensor->Clone()).Set(activationQuant); + + auto returnConn = activation->Output(TensorUsage::OFM); + if ( activationType == OpType::Sigmoid ) + { + // For Sigmoid we need to set the activation min/max values to match the possible range + // in the reference. The values below are the quantized min/max values that the reference + // can achive for the LUT based Sigmoid/Logistic. (The NPU does however have a larger range + // due to intermediate higher precision.) + auto clamp = std::make_shared(OpType::Clamp); + auto *attr = clamp->Attribute(); + attr->max = Quantize(32757.0f, activationQuant); + attr->min = Quantize(11.0f, activationQuant); + + // Copying the input and output of the Add means the Clamp will also write to the cell state. 
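+        // (For this gate the copied connection is the activation's own output rather than the cell
+        // state: the Clamp rewrites the gate output in place, restoring the reference output range
+        // that the NPU's higher intermediate precision would otherwise exceed.)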
+ clamp->CopyInput(TensorUsage::IFM, *returnConn); + clamp->CopyOutput(TensorUsage::OFM, *returnConn); + + RecordOptimisation(clamp.get()); + returnConn = clamp->Output(TensorUsage::OFM); + } + + RecordOptimisation(inputFC); + RecordOptimisation(recurrentFC); + RecordOptimisation(add); + RecordOptimisation(activation.get()); + return returnConn; +} + +// Calculate and update the cell state from the provided gates +// Clip( Add( Mul( cell state, forget gate ), Mul( cell gate, input gate ) ) ) +TensorConnection *LSTM::CalculateCellState(TensorConnection *cellStateConn, TensorConnection *inputGateConn, + TensorConnection *forgetGateConn, TensorConnection *cellGateConn, int time, int batch) +{ + const Quantization &cellStateQuant = cellStateConn->quantization; + double cellStateScale = cellStateQuant.scales[0].Dequantize(); + // Calculate explicit scales based on the cell state quantization + Quantization mulCFQuant; + mulCFQuant.type = QuantizationType::TFLITE; + mulCFQuant.scales = {ElementwiseMulScale(cellStateScale, Q0_15_SCALE, cellStateScale)}; + mulCFQuant.zeroPoints = {cellStateConn->quantization.zeroPoints[0]}; + // Create Mul(cell_state, forget_gate) + Operation *mulCF = CreateMul(cellStateConn->tensor, forgetGateConn->tensor, mulCFQuant, mulCFQuant, mulCFQuant, + DataType::None, &forgetGateConn->shape, &forgetGateConn->shape); + SetStateRead(mulCF, batch); + + // Calculate explicit scales based on the cell state quantization + Quantization mulCIQuant; + mulCIQuant.type = QuantizationType::TFLITE; + mulCIQuant.scales = {ElementwiseMulScale(Q0_15_SCALE, Q0_15_SCALE, cellStateScale)}; + mulCIQuant.zeroPoints = {cellStateConn->quantization.zeroPoints[0]}; + // Create Mul(cell_gate, input_gate) + Operation *mulCI = CreateMul(cellGateConn->tensor, inputGateConn->tensor, mulCIQuant, mulCIQuant, mulCIQuant); + + // Create Add with cell state quantization + Operation *add = CreateAdd(mulCF->Output(TensorUsage::OFM)->tensor, mulCI->Output(TensorUsage::OFM)->tensor, + cellStateQuant, cellStateQuant, cellStateQuant); + // Redirect the ofm of Add to cell state. + SetStateWrite(add, MakeTensorUsage(TensorUsage::State, 1), batch); + + RecordOptimisation(mulCF); + RecordOptimisation(mulCI); + RecordOptimisation(add); + + TensorConnection *returnConn = add->Output(TensorUsage::OFM); + if ( _cellClip != 0 ) + { + // If the cell clip attribute is non-zero the output needs to be clamped. + auto clamp = std::make_shared(OpType::Clamp); + auto *attr = clamp->Attribute(); + attr->max = Quantize(static_cast(_cellClip), cellStateQuant); + attr->min = Quantize(static_cast(-_cellClip), cellStateQuant); + + // Copying the input and output of the Add means the Clamp will also write to the cell state. 
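+        // (The Add's OFM was redirected to the cell state tensor by SetStateWrite above, so the
+        // Clamp becomes the final writer of the cell state for this step.)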
+ clamp->CopyInput(TensorUsage::IFM, *returnConn); + clamp->CopyOutput(TensorUsage::OFM, *returnConn); + + RecordOptimisation(clamp.get()); + returnConn = clamp->Output(TensorUsage::OFM); + } + + return returnConn; +} + +// Calculate and update the output state from the provided gate output +// Mul( Tanh(cell state), output gate ) +TensorConnection *LSTM::CalculateOutputState(TensorConnection *outputGateConn, TensorConnection *cellStateConn, int time, int batch) +{ + // Setup tanh quantization + Quantization tanhQuant; + tanhQuant.type = QuantizationType::TFLITE; + tanhQuant.scales = {QuantizedScale(Q0_15_SCALE)}; + tanhQuant.zeroPoints = {0}; + + // Create tanh(cell state) + auto tanh = std::make_shared(OpType::Tanh); + tanh->ConnectInput(TensorUsage::IFM, cellStateConn->tensor).Set(cellStateConn->shape).Set(cellStateConn->quantization); + + // Tanh reads from the cell state. This may set an ifm slice which the ofm shape needs to honor. + SetStateRead(tanh.get(), batch); + auto tanhIfmConn = tanh->Input(TensorUsage::IFM); + Shape tanhOfmShape = tanhIfmConn->SliceShape(); + // Create a new tensor for ofm instead of cloning, this ensures that the tanh output will not + // overwrite the cell state. + auto ofmName = fmt::format("{0}_tanh_b{1}.t{2}", cellStateConn->tensor->Name(), batch, time); + auto tanhOfm = std::make_shared(ofmName, cellStateConn->tensor->Type(), tanhOfmShape); + tanh->ConnectOutput(TensorUsage::OFM, tanhOfm).Set(tanhQuant); + + // Create Mul( Tanh, output gate ) + // Ofm quantization is based on the hidden scale. + double hiddenScale = _lstmOp->Input(MakeTensorUsage(TensorUsage::Scratch, 4))->quantization.scales[0].Dequantize(); + auto mulQuant = _ofmConn->quantization; + mulQuant.type = QuantizationType::TFLITE; + mulQuant.scales = {ElementwiseMulScale(Q0_15_SCALE, Q0_15_SCALE, hiddenScale)}; + Operation *mul = CreateMul(tanhOfm, outputGateConn->tensor, tanhQuant, tanhQuant, mulQuant, _ifmConn->tensor->Type()); + + // Save new output state + SetStateWrite(mul, TensorUsage::State, batch); + + RecordOptimisation(tanh.get()); + RecordOptimisation(mul); + return mul->Output(TensorUsage::OFM); +} + + +// Generate one step of the LSTM for the provided feature, batch and time +std::pair LSTM::Step(TensorConnection *featureConn, + TensorConnection *outputStateConn, TensorConnection *cellStateConn, int time, int batch) +{ + assert(outputStateConn && cellStateConn); + auto suffix = fmt::format("b{0}.t{1}", batch, time); + + auto inputToInputWeightConn = _lstmOp->Input(MakeTensorUsage(TensorUsage::Weights, 0)); + auto recurrentToInputWeightConn = _lstmOp->Input(MakeTensorUsage(TensorUsage::Weights, 4)); + auto inputBiasConn = _lstmOp->Input(MakeTensorUsage(TensorUsage::Scales, 0)); + TensorConnection *inputGate = CalculateGate(fmt::format("input_gate_{0}", suffix), featureConn, outputStateConn, + inputToInputWeightConn, inputBiasConn, recurrentToInputWeightConn, OpType::Sigmoid, batch); + + auto inputToForgetWeightConn = _lstmOp->Input(MakeTensorUsage(TensorUsage::Weights, 1)); + auto recurrentToForgetWeightConn = _lstmOp->Input(MakeTensorUsage(TensorUsage::Weights, 5)); + auto forgetBiasConn = _lstmOp->Input(MakeTensorUsage(TensorUsage::Scales, 1)); + TensorConnection *forgetGate = CalculateGate(fmt::format("forget_gate_{0}", suffix), featureConn, outputStateConn, + inputToForgetWeightConn, forgetBiasConn, recurrentToForgetWeightConn, OpType::Sigmoid, batch); + + auto inputToCellWeightConn = _lstmOp->Input(MakeTensorUsage(TensorUsage::Weights, 2)); + auto recurrentToCellWeightConn 
= _lstmOp->Input(MakeTensorUsage(TensorUsage::Weights, 6)); + auto cellBiasConn = _lstmOp->Input(MakeTensorUsage(TensorUsage::Scales, 2)); + TensorConnection *cellGate = CalculateGate(fmt::format("cell_gate_{0}", suffix), featureConn, outputStateConn, + inputToCellWeightConn, cellBiasConn, recurrentToCellWeightConn, OpType::Tanh, batch); + + // Calculate and update cell state + cellStateConn = CalculateCellState(cellStateConn, inputGate, forgetGate, cellGate, time, batch); + + auto inputToOutputWeightConn = _lstmOp->Input(MakeTensorUsage(TensorUsage::Weights, 3)); + auto recurrentToOutputWeightConn = _lstmOp->Input(MakeTensorUsage(TensorUsage::Weights, 7)); + auto outputBiasConn = _lstmOp->Input(MakeTensorUsage(TensorUsage::Scales, 3)); + TensorConnection *outputGate = CalculateGate(fmt::format("output_gate_{0}", suffix), featureConn, outputStateConn, + inputToOutputWeightConn, outputBiasConn, recurrentToOutputWeightConn, OpType::Sigmoid, batch); + + // Calculate and update ouput state + assert(cellStateConn); + outputStateConn = CalculateOutputState(outputGate, cellStateConn, time, batch); + + return {outputStateConn, cellStateConn}; +} + +} // namespace regor diff --git a/ethosu/regor/compiler/lstm.hpp b/ethosu/regor/compiler/lstm.hpp new file mode 100644 index 00000000..fbe2eb1d --- /dev/null +++ b/ethosu/regor/compiler/lstm.hpp @@ -0,0 +1,67 @@ +// +// SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#pragma once + +#include "graph_optimiser.hpp" +#include "operation.hpp" + +namespace regor +{ + +/// +/// TFLite Graph optimiser LSTM rewriter +/// +class LSTM +{ +private: + Operation *_lstmOp = nullptr; + OptimiserDatabase *_db = nullptr; + Graph *_graph = nullptr; + + // Dimensions + int _nFeature, _nTime, _nBatch; + // Attributes + int _cellClip; + bool _isTimeMajor; + // Input/Output + TensorConnection *_ifmConn = nullptr; + TensorConnection *_ofmConn = nullptr; + +public: + LSTM(Operation *operation, OptimiserDatabase *db, Graph *graph); + Operation *ConvertOp(); + +private: + void RecordOptimisation(Operation *op); + TensorConnection *ExtractFeatureSlice(int time, int batch); + TensorConnection *GetInitialState(TensorUsage stateUsage, int batch); + void SetStateRead(Operation *op, int batch); + void SetStateWrite(Operation *op, TensorUsage stateUsage, int batch); + Operation *SetOutputWrite(TensorConnection *stateConn, int time, int batch); + TensorConnection *CalculateGate(const std::string &name, TensorConnection *featureConn, TensorConnection *stateConn, + TensorConnection *inputWeightConn, TensorConnection *inputBiasConn, TensorConnection *recurrentWeightConn, + OpType activationType, int batch); + TensorConnection *CalculateCellState(TensorConnection *cellStateConn, TensorConnection *inputGateConn, + TensorConnection *forgetGateConn, TensorConnection *cellGateConn, int time, int batch); + TensorConnection *CalculateOutputState(TensorConnection *outputGateConn, TensorConnection *cellStateConn, int time, int batch); + std::pair Step(TensorConnection *featureConn, + TensorConnection *outputStateConn, TensorConnection *cellStateConn, int time, int batch); +}; + +} // namespace regor diff --git a/ethosu/regor/compiler/operation.hpp b/ethosu/regor/compiler/operation.hpp index c2a72a9e..bd3283be 100644 --- a/ethosu/regor/compiler/operation.hpp +++ b/ethosu/regor/compiler/operation.hpp @@ -122,6 +122,8 @@ struct TensorConnection rounding = r; return *this; } + + const Shape &SliceShape() const { return slice.shape ? slice.shape : shape; } }; diff --git a/ethosu/regor/compiler/operation_util.hpp b/ethosu/regor/compiler/operation_util.hpp index d3fc1edc..b77e8ed4 100644 --- a/ethosu/regor/compiler/operation_util.hpp +++ b/ethosu/regor/compiler/operation_util.hpp @@ -261,6 +261,33 @@ inline Operation *CreateRescaleAdd(const std::shared_ptr &ifm, const std return op; } +inline Operation *CreateFullyConnected(const std::string &name, const std::shared_ptr &ifm, + const std::shared_ptr &weights, const Quantization &ifmQuantization, const Quantization &weightQuantization, + const Quantization &ofmQuantization, const Shape ifmShape, DataType ofmDtype = DataType::None, + std::shared_ptr bias = nullptr, const Quantization &biasQuantization = Quantization::Unit()) +{ + int numOutputs = weights->StorageShape()[0]; + + auto op = std::make_shared(OpType::FullyConnected); + op->ConnectInput(TensorUsage::IFM, ifm).Set(ifmShape).Set(ifmQuantization); + op->ConnectInput(TensorUsage::Weights, weights).Set(weights->StorageShape()).Set(weightQuantization); + + if ( bias == nullptr ) + { + DataType biasType = ifm->Type() == DataType::Int16 ? 
DataType::Int64 : DataType::Int32; + std::vector zeroBuf(DataTypeStorageSizeBytes(biasType, 1), 0); + bias = CreateConstTensor(name + std::string("_bias"), biasType, std::make_shared(std::move(zeroBuf))); + } + + op->ConnectInput(TensorUsage::Scales, bias).Set(Shape(numOutputs)).Set(biasQuantization); + + // Setup OFM + if ( ofmDtype == DataType::None ) ofmDtype = ifm->Type(); + auto ofm = std::make_shared(name + "_ofm", ofmDtype, Shape(ifmShape[0], numOutputs)); + op->ConnectOutput(TensorUsage::OFM, ofm).Set(ofmQuantization); + return op.get(); +} + inline TransposeType CalculateTransposeType(const Operation &operation) { const auto *paramsConn = operation.Input(TensorUsage::Params); diff --git a/ethosu/regor/compiler/quantization.hpp b/ethosu/regor/compiler/quantization.hpp index dd054078..19c8caf4 100644 --- a/ethosu/regor/compiler/quantization.hpp +++ b/ethosu/regor/compiler/quantization.hpp @@ -89,4 +89,11 @@ public: } }; +inline int64_t Quantize(float value, const Quantization &quant) +{ + float scale = quant.scales.empty() ? 1.0f : float(quant.scales[0].Dequantize()); + int64_t zp = quant.zeroPoints.empty() ? 0 : quant.zeroPoints[0]; + return zp + int64_t(std::round(double(value / scale))); +} + } // namespace regor diff --git a/ethosu/regor/compiler/scheduler.cpp b/ethosu/regor/compiler/scheduler.cpp index df9c8d64..9beb52fd 100644 --- a/ethosu/regor/compiler/scheduler.cpp +++ b/ethosu/regor/compiler/scheduler.cpp @@ -206,8 +206,8 @@ int Scheduler::UpdateSchedulerTensor(TensorUsage usage, SchedulerConnection *con conn->requireFullTensor = true; } - // Force linear format for read only tensors - if ( tensor->IsConstant() ) + // Force linear format for read only or persistent tensors + if ( tensor->IsConstant() || tensor->isPersistent ) { tensor->needsLinearFormat = true; } diff --git a/ethosu/regor/compiler/scheduler_packing.cpp b/ethosu/regor/compiler/scheduler_packing.cpp index a6ff98b3..550f8df2 100644 --- a/ethosu/regor/compiler/scheduler_packing.cpp +++ b/ethosu/regor/compiler/scheduler_packing.cpp @@ -429,6 +429,18 @@ int SchedulerPacking::CanPack(const SchedulerOperation *schedOp, const Scheduler return 0; } + if ( schedOp->Type() == OpType::FullyConnected ) + { + return 0; + } + + // Do not pack persistent tensors with non persistent tensors + // if ( ifmTensor->isPersistent != prevOFM->isPersistent ) + if ( prevOFM->isPersistent != nextOp->OFM()->tensor->isPersistent ) + { + return 0; + } + // Previous op in execution order doesn't connect to this one if ( prevOFM != ifmTensor && prevOFM != ifm2Tensor ) { @@ -497,6 +509,21 @@ void SchedulerPacking::InitSchedulerTensor(SchedulerTensor *schedTensor, Tensor schedTensor->isGraphOutput = graph->IsOutput(tensor); schedTensor->isPersistent = graph->IsPersistent(tensor); schedTensor->uid = tensor->Uid(); + if ( tensor->View().HasBuffer() ) + { + // Assign equivalenceIds based on the underlying buffer of the GraphIR tensor (if present). + // This ensures that all tensors sharing a buffer will be allocated to the same memory. 
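+        // For example, the per-step state clones created by the LSTM lowering share the original
+        // variable tensor's buffer, so the lookup below hands every clone the same equivalenceId
+        // and the allocator places them all at one address.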
+ auto buffer = tensor->View().Buffer(); + auto eqId = _bufferEquivalenceIdMap.find(buffer->Hash()); + if ( eqId == _bufferEquivalenceIdMap.end() ) + { + _bufferEquivalenceIdMap.emplace(buffer->Hash(), schedTensor->equivalenceId); + } + else + { + schedTensor->equivalenceId = eqId->second; + } + } } std::unique_ptr SchedulerPacking::MakeSchedulerOperation(Operation *op, const Graph *graph) diff --git a/ethosu/regor/compiler/scheduler_packing.hpp b/ethosu/regor/compiler/scheduler_packing.hpp index 40621851..eca43ca8 100644 --- a/ethosu/regor/compiler/scheduler_packing.hpp +++ b/ethosu/regor/compiler/scheduler_packing.hpp @@ -50,6 +50,7 @@ protected: bool _disableChaining = false; std::vector> _schedList; std::unordered_map> _tensorMap; + std::unordered_map _bufferEquivalenceIdMap; public: SchedulerPacking(Architecture *arch, bool disableChaining); diff --git a/ethosu/regor/compiler/tflite_graph_optimiser.cpp b/ethosu/regor/compiler/tflite_graph_optimiser.cpp index 4ac1eaa7..e3ecd8c7 100644 --- a/ethosu/regor/compiler/tflite_graph_optimiser.cpp +++ b/ethosu/regor/compiler/tflite_graph_optimiser.cpp @@ -26,6 +26,7 @@ #include "common/transpose_type.hpp" #include "graph.hpp" #include "graph_optimiser.hpp" +#include "lstm.hpp" #include "op_type.hpp" #include "operation.hpp" #include "optimiser_utils.hpp" @@ -1701,6 +1702,16 @@ Operation *TFLiteGraphOptimiser::ConvertSoftmaxOps(Graph *const graph, Operation return _softmax->ConvertOp(operation); } +Operation *TFLiteGraphOptimiser::ConvertLstmOps(Graph *const graph, Operation *const operation) +{ + if ( operation->Type() == OpType::UnidirectionalSequenceLstm ) + { + auto lstmLowering = LSTM(operation, _db, graph); + return lstmLowering.ConvertOp(); + } + return operation; +} + Operation *TFLiteGraphOptimiser::ConvertMeanOps(Graph *const, Operation *const operation) { auto returnOp = operation; diff --git a/ethosu/regor/compiler/tflite_graph_optimiser.hpp b/ethosu/regor/compiler/tflite_graph_optimiser.hpp index 72f90b0b..30710470 100644 --- a/ethosu/regor/compiler/tflite_graph_optimiser.hpp +++ b/ethosu/regor/compiler/tflite_graph_optimiser.hpp @@ -131,6 +131,8 @@ private: Operation *ConvertSoftmaxOps(Graph *const graph, Operation *const operation); + Operation *ConvertLstmOps(Graph *const graph, Operation *const operation); + Operation *ConvertMeanOps(Graph *const, Operation *const operation); // Converts int8/uint8 Sigmoid and Tanh to a LUT based solution @@ -253,6 +255,7 @@ public: &TFLiteGraphOptimiser::ConvertLogToLUT, &TFLiteGraphOptimiser::ConvertTanhSigmoidToLUT, &TFLiteGraphOptimiser::ConvertSoftmaxOps, + &TFLiteGraphOptimiser::ConvertLstmOps, &TFLiteGraphOptimiser::ReplacePadByExplicitPadding, &TFLiteGraphOptimiser::ConvertMeanOps, &TFLiteGraphOptimiser::ConvertPrelu, diff --git a/ethosu/regor/test/test_passthrough.cpp b/ethosu/regor/test/test_passthrough.cpp index 0698ead6..10c13f6f 100644 --- a/ethosu/regor/test/test_passthrough.cpp +++ b/ethosu/regor/test/test_passthrough.cpp @@ -515,7 +515,17 @@ TEST_CASE("passthrough") } { - // Generate simple output tensor + // Generate intermediate tensor (tensor index 4) + // Intermediates cannot be constant and must use buffer 0 + const std::vector shape = {1, 11, 11, 3}; + const tflite::TensorType type = tflite::TensorType::INT16; + const int bufferIndex = 0; + const std::string name = "intermediate"; + tensors.push_back(tflite::CreateTensorDirect(fbb, &shape, type, bufferIndex, name.c_str())); + } + + { + // Generate simple output tensor (tensor index 5) const std::vector shape = {1, 11, 11, 
3}; const tflite::TensorType type = tflite::TensorType::FLOAT32; const int bufferIndex = 0; @@ -536,10 +546,10 @@ TEST_CASE("passthrough") // Generate 1 operator const uint32_t opcodeIndex = 0; const std::vector inputs = {0, 1, 2, 3}; - const std::vector outputs = {4}; + const std::vector intermediates = {4}; + const std::vector outputs = {5}; const std::vector customOptions = random_vector(5); const std::vector mutatingVariableInputs = random_vector(4); - const std::vector intermediates = random_vector(4); // Generate builtin_options or builtin_options2 flatbuffers::Offset<> builtinOptions = 0; @@ -567,7 +577,7 @@ TEST_CASE("passthrough") { // Generate 1 subgraph const std::vector inputs = {0 /* ifm0 */}; - const std::vector outputs = {4 /* ofm */}; + const std::vector outputs = {5 /* ofm */}; const char *name = "subgraph1"; subgraphs.push_back(tflite::CreateSubGraphDirect(fbb, &tensors, &inputs, &outputs, &operations, name)); } diff --git a/ethosu/regor/tflite/tflite_mapping.cpp b/ethosu/regor/tflite/tflite_mapping.cpp index 27c31c39..f4b4eec0 100644 --- a/ethosu/regor/tflite/tflite_mapping.cpp +++ b/ethosu/regor/tflite/tflite_mapping.cpp @@ -630,8 +630,32 @@ const std::multimap TfLiteMapping::_inputTensorIndices = { {OpType::Transpose, TensorUsage::IFM0}, {OpType::Transpose, TensorUsage::Params}, {OpType::Unique, TensorUsage::IFM0}, + // LSTM {OpType::UnidirectionalSequenceLstm, TensorUsage::IFM0}, {OpType::UnidirectionalSequenceLstm, TensorUsage::Weights}, + {OpType::UnidirectionalSequenceLstm, MakeTensorUsage(TensorUsage::Weights, 1)}, + {OpType::UnidirectionalSequenceLstm, MakeTensorUsage(TensorUsage::Weights, 2)}, + {OpType::UnidirectionalSequenceLstm, MakeTensorUsage(TensorUsage::Weights, 3)}, + {OpType::UnidirectionalSequenceLstm, MakeTensorUsage(TensorUsage::Weights, 4)}, + {OpType::UnidirectionalSequenceLstm, MakeTensorUsage(TensorUsage::Weights, 5)}, + {OpType::UnidirectionalSequenceLstm, MakeTensorUsage(TensorUsage::Weights, 6)}, + {OpType::UnidirectionalSequenceLstm, MakeTensorUsage(TensorUsage::Weights, 7)}, + {OpType::UnidirectionalSequenceLstm, MakeTensorUsage(TensorUsage::Weights, 8)}, + {OpType::UnidirectionalSequenceLstm, MakeTensorUsage(TensorUsage::Weights, 9)}, + {OpType::UnidirectionalSequenceLstm, MakeTensorUsage(TensorUsage::Weights, 10)}, + {OpType::UnidirectionalSequenceLstm, TensorUsage::Scales}, + {OpType::UnidirectionalSequenceLstm, MakeTensorUsage(TensorUsage::Scales, 1)}, + {OpType::UnidirectionalSequenceLstm, MakeTensorUsage(TensorUsage::Scales, 2)}, + {OpType::UnidirectionalSequenceLstm, MakeTensorUsage(TensorUsage::Scales, 3)}, + {OpType::UnidirectionalSequenceLstm, MakeTensorUsage(TensorUsage::Weights, 11)}, + {OpType::UnidirectionalSequenceLstm, MakeTensorUsage(TensorUsage::Scales, 4)}, + {OpType::UnidirectionalSequenceLstm, TensorUsage::State}, + {OpType::UnidirectionalSequenceLstm, MakeTensorUsage(TensorUsage::State, 1)}, + {OpType::UnidirectionalSequenceLstm, MakeTensorUsage(TensorUsage::Scales, 5)}, + {OpType::UnidirectionalSequenceLstm, MakeTensorUsage(TensorUsage::Scales, 6)}, + {OpType::UnidirectionalSequenceLstm, MakeTensorUsage(TensorUsage::Scales, 7)}, + {OpType::UnidirectionalSequenceLstm, MakeTensorUsage(TensorUsage::Scales, 8)}, + // RNN {OpType::UnidirectionalSequenceRnn, TensorUsage::IFM0}, {OpType::UnidirectionalSequenceRnn, TensorUsage::Weights}, {OpType::Unpack, TensorUsage::IFM0}, diff --git a/ethosu/regor/tflite/tflite_model_semantics.cpp b/ethosu/regor/tflite/tflite_model_semantics.cpp index ca33f241..c27b97c7 100644 --- 
a/ethosu/regor/tflite/tflite_model_semantics.cpp +++ b/ethosu/regor/tflite/tflite_model_semantics.cpp @@ -245,9 +245,11 @@ void ConstraintEmptyConstTensors(const Model &m_model) { auto tensor = tensors[BoundsCheckedIndex(input, tensors)]; auto buffer = buffers[BoundsCheckedIndex(tensor->buffer(), buffers)]; - // Buffer 0 is a special buffer that is used for empty tensors - if ( (tensor->buffer() > 0 && (!buffer->data() || buffer->data()->size() == 0) && buffer->offset() <= 1) || - (buffer->offset() > 1 && buffer->size() == 0) ) + // Buffer 0 is a special buffer that is used for empty tensors. + // Variable tensors are also empty but are not forced to use Buffer 0. + if ( !tensor->is_variable() && + ((tensor->buffer() > 0 && (!buffer->data() || buffer->data()->size() == 0) && buffer->offset() <= 1) || + (buffer->offset() > 1 && buffer->size() == 0)) ) { std::string constraint = "Constant tensors must not have empty buffers"; std::string extra = "Found Constant Tensor with empty buffer"; diff --git a/ethosu/regor/tflite/tflite_reader.cpp b/ethosu/regor/tflite/tflite_reader.cpp index 9710d50a..d0e17bcf 100644 --- a/ethosu/regor/tflite/tflite_reader.cpp +++ b/ethosu/regor/tflite/tflite_reader.cpp @@ -90,6 +90,27 @@ static void SetKernel(const std::shared_ptr &operation, const Point2i operation->SetKernel(std::move(kernel)); } +static void ReshapeFullyConnectedWeights(const std::shared_ptr &operation, TensorUsage weightUsage) +{ + auto weight_tensor = operation->Input(weightUsage)->tensor; + if ( weight_tensor->AxisOrder() == AxisOrder::Unknown ) + { + const auto &shape = weight_tensor->StorageShape(); + // Reshape weight tensor from (num_outputs, ..., num_inputs) to (num_outputs, 1, 1, num_inputs) + if ( shape.Size() >= 2 && shape.Elements() == (shape[0] * shape[-1]) ) + { + weight_tensor->Reshape(Shape(shape[0], 1, 1, shape[-1])); + weight_tensor->SetAxisOrder(AxisOrder::OHWI); + operation->Input(weightUsage)->shape = weight_tensor->StorageShape(); + } + } + else + { + // Weight tensor has already been reshaped + assert(weight_tensor->AxisOrder() == AxisOrder::OHWI); + } +} + const tflite::Model *TfLiteReader::LoadModel(const void *input, size_t size) { const uint8_t *buffer = static_cast(input); @@ -192,6 +213,7 @@ void TfLiteReader::LoadGraphs(const uint8_t *input, const tflite::Model *model, assert(tflite_inputs); auto tflite_outputs = tflite_operator->outputs(); assert(tflite_outputs); + auto tflite_intermediates = tflite_operator->intermediates(); const auto &input_tensors = *tflite_inputs; // A vector of indices into the `tensors` vector int indirect_index = 0; // An index into `input_tensors` int ifm_count = 0; @@ -242,6 +264,19 @@ void TfLiteReader::LoadGraphs(const uint8_t *input, const tflite::Model *model, placeholder.push_back(std::move(tensor)); } + if ( tflite_intermediates ) + { + // Connect operation to its intermediate tensors. They are added as inputs with usage Intermediate. 
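+            // The connections below use the Scratch usage type; for UnidirectionalSequenceLstm the
+            // fifth intermediate (index 4) carries the effective hidden scale, whose quantization
+            // the LSTM lowering reads when scaling the output-state Mul.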
+ int intermediate_count = 0; + for ( const int tensor_index : *tflite_intermediates ) + { + const auto &intermediate = tensors.at(tensor_index); + assert(tensorQuantization.count(intermediate->Uid()) > 0); + operation->ConnectInput(MakeTensorUsage(TensorUsage::Scratch, intermediate_count++), intermediate) + .Set(tensorQuantization[intermediate->Uid()]); + } + } + // Connect operation to its output tensors int ofm_count = 0; for ( const int tensor_index : *tflite_outputs ) @@ -430,6 +465,14 @@ std::shared_ptr TfLiteReader::ParseTensor(const tflite::Tensor *tflite_t LOG_WARN("Tensor '{}' contains sparsity information, which is not supported and will be ignored.\n", name); } + if ( tflite_tensor->is_variable() ) + { + // Create an empty buffer for variable tensor + assert(buffer == nullptr && "Unexpected buffer for variable tensor!"); + auto emptyBuffer = std::make_shared(std::vector{}); + tensor->SetBuffer(emptyBuffer); + } + tensor->SetPassthrough(tflite_tensor); return tensor; @@ -510,23 +553,8 @@ void TfLiteReader::ParseOperatorOptions( const auto options = GetBuiltinOptions(tflite_operator); activation_function = options->fused_activation_function(); // TODO: Are `weights_format`, `keep_num_dims` or `asymmetric_quantize_inputs` used? + ReshapeFullyConnectedWeights(operation, TensorUsage::Weights); auto weight_tensor = operation->Input(TensorUsage::Weights)->tensor; - if ( weight_tensor->AxisOrder() == AxisOrder::Unknown ) - { - const auto &shape = weight_tensor->StorageShape(); - // Reshape weight tensor from (num_outputs, ..., num_inputs) to (num_outputs, 1, 1, num_inputs) - if ( shape.Size() >= 2 && shape.Elements() == (shape[0] * shape[-1]) ) - { - weight_tensor->Reshape(Shape(shape[0], 1, 1, shape[-1])); - weight_tensor->SetAxisOrder(AxisOrder::OHWI); - operation->Input(TensorUsage::Weights)->shape = weight_tensor->StorageShape(); - } - } - else - { - // Weight tensor has already been reshaped - assert(weight_tensor->AxisOrder() == AxisOrder::OHWI); - } if ( operation->Input(TensorUsage::Scales) == nullptr ) { // Op has no bias; add bias tensor filled with zeros @@ -698,6 +726,23 @@ void TfLiteReader::ParseOperatorOptions( } break; + case tflite::BuiltinOptions::UnidirectionalSequenceLSTMOptions: + { + const auto options = GetBuiltinOptions(tflite_operator); + operation->Attribute()->cell_clip = options->cell_clip(); + operation->Attribute()->projection_clip = options->proj_clip(); + operation->Attribute()->time_major = options->time_major(); + + for ( int i = 0; i < 12; i++ ) + { + if ( operation->Input(MakeTensorUsage(TensorUsage::Weights, i)) ) + { + ReshapeFullyConnectedWeights(operation, MakeTensorUsage(TensorUsage::Weights, i)); + } + } + } + break; + case tflite::BuiltinOptions::ResizeBilinearOptions: case tflite::BuiltinOptions::ResizeNearestNeighborOptions: break; @@ -768,7 +813,6 @@ void TfLiteReader::ParseOperatorOptions( case tflite::BuiltinOptions::FillOptions: case tflite::BuiltinOptions::BidirectionalSequenceLSTMOptions: case tflite::BuiltinOptions::BidirectionalSequenceRNNOptions: - case tflite::BuiltinOptions::UnidirectionalSequenceLSTMOptions: case tflite::BuiltinOptions::FloorModOptions: case tflite::BuiltinOptions::RangeOptions: case tflite::BuiltinOptions::SquaredDifferenceOptions: diff --git a/ethosu/regor/tflite/tflite_supported_operators.cpp b/ethosu/regor/tflite/tflite_supported_operators.cpp index e0218087..f30933c1 100644 --- a/ethosu/regor/tflite/tflite_supported_operators.cpp +++ b/ethosu/regor/tflite/tflite_supported_operators.cpp @@ -950,6 +950,53 @@ 
bool TfLiteSupportedOperators::ConstraintLog(const Operation *op) return true; } +bool TfLiteSupportedOperators::ConstraintLSTM(const Operation *op) +{ + OpType opType = op->Type(); + if ( opType != OpType::UnidirectionalSequenceLstm ) + { + return true; + } + + for ( int i = 0; i <= 7; i++ ) + { + // Check that all the gate weights are present. If they are not it's either invalid or using Couple + // Input and Forget Gate (CIFG), where the input gate is computed implicitly from the forget gate, + // which is not supported. + if ( op->Input(MakeTensorUsage(TensorUsage::Weights, i)) == nullptr ) + { + Failure(op, "Missing gate weight tensor", "LSTM with implicit gate calculation is not supported"); + return false; + } + } + + for ( int i = 8; i <= 10; i++ ) + { + if ( op->Input(MakeTensorUsage(TensorUsage::Weights, i)) ) + { + Failure(op, "Peephole weight tensor present", "Peephole LSTM variant is not supported"); + return false; + } + } + + if ( op->Input(MakeTensorUsage(TensorUsage::Weights, 11)) || op->Input(MakeTensorUsage(TensorUsage::Scales, 4)) ) + { + Failure(op, "Projection weight or bias tensor present", "LSTM with projection is not supported"); + return false; + } + + for ( int i = 5; i <= 8; i++ ) + { + if ( op->Input(MakeTensorUsage(TensorUsage::Scales, i)) ) + { + Failure(op, "Normalization coefficient tensor present", "LSTM with gate normalization is not supported"); + return false; + } + } + + return true; +} + void TfLiteSupportedOperators::Failure(const Operation *op, const std::string &message, const std::string &constraint) { assert(op); @@ -1011,6 +1058,7 @@ TfLiteSupportedOperators::TfLiteSupportedOperators(IArchitectureConstraints *con &TfLiteSupportedOperators::ConstraintTransposeDims, &TfLiteSupportedOperators::ConstraintStridedSlice, &TfLiteSupportedOperators::ConstraintLog, + &TfLiteSupportedOperators::ConstraintLSTM, }; } diff --git a/ethosu/regor/tflite/tflite_supported_operators.hpp b/ethosu/regor/tflite/tflite_supported_operators.hpp index 0b7126e7..811cb6c2 100644 --- a/ethosu/regor/tflite/tflite_supported_operators.hpp +++ b/ethosu/regor/tflite/tflite_supported_operators.hpp @@ -76,6 +76,7 @@ private: bool ConstraintTransposeDims(const Operation *op); bool ConstraintStridedSlice(const Operation *op); bool ConstraintLog(const Operation *op); + bool ConstraintLSTM(const Operation *op); }; // Factory for supported-ops checkers diff --git a/ethosu/regor/tflite/tflite_supported_operators_u55.cpp b/ethosu/regor/tflite/tflite_supported_operators_u55.cpp index f49ccdb8..5bad24e3 100644 --- a/ethosu/regor/tflite/tflite_supported_operators_u55.cpp +++ b/ethosu/regor/tflite/tflite_supported_operators_u55.cpp @@ -80,6 +80,7 @@ TfLiteSupportedOperatorsU55::TfLiteSupportedOperatorsU55(IArchitectureConstraint OpType::HardSwish, OpType::MemoryCopy, OpType::Log, + OpType::UnidirectionalSequenceLstm, // clang-format on }; _supportedDataTypes = { diff --git a/ethosu/regor/tflite/tflite_supported_operators_u85.cpp b/ethosu/regor/tflite/tflite_supported_operators_u85.cpp index 9b9fc635..db231e77 100644 --- a/ethosu/regor/tflite/tflite_supported_operators_u85.cpp +++ b/ethosu/regor/tflite/tflite_supported_operators_u85.cpp @@ -101,6 +101,7 @@ TfLiteSupportedOperatorsU85::TfLiteSupportedOperatorsU85(IArchitectureConstraint OpType::ReduceAll, OpType::MemoryCopy, OpType::Log, + OpType::UnidirectionalSequenceLstm, // clang-format on }; _supportedDataTypes = { diff --git a/ethosu/regor/tflite/tflite_writer.cpp b/ethosu/regor/tflite/tflite_writer.cpp index 584635ee..19eebc06 100644 --- 
a/ethosu/regor/tflite/tflite_writer.cpp +++ b/ethosu/regor/tflite/tflite_writer.cpp @@ -169,12 +169,20 @@ std::unique_ptr TfLiteWriter::SerialiseImpl(const std::vector inputs, outputs; + std::vector inputs, outputs, intermediates; for ( const auto &tensor : SortedInputTensors(operation, type) ) { // Skip placeholder tensors if ( graph->IsPlaceholder(tensor) ) continue; - inputs.push_back(SerialisedTensorIndex(tensor, tensor_address_map, *graph)); + if ( (operation->UsageOfTensor(tensor) & TensorUsage::TypeMask) == TensorUsage::Scratch ) + { + // Scratch usage means this is an intermediate tensor + intermediates.push_back(SerialisedTensorIndex(tensor, tensor_address_map, *graph)); + } + else + { + inputs.push_back(SerialisedTensorIndex(tensor, tensor_address_map, *graph)); + } } for ( const auto &connection : operation->Outputs() ) { @@ -188,7 +196,6 @@ std::unique_ptr TfLiteWriter::SerialiseImpl(const std::vector> custom_options = 0; flatbuffers::Offset> mvi = 0; // mutating_variable_inputs - flatbuffers::Offset> intermediates = 0; uint64_t large_custom_options_offset = 0; uint64_t large_custom_options_size = 0; @@ -212,16 +219,16 @@ std::unique_ptr TfLiteWriter::SerialiseImpl(const std::vectorcustom_options_format(); custom_options = FlatbufferUtils::CopyVector(_flatbuffer, tflite_operator->custom_options()); mvi = FlatbufferUtils::CopyVector(_flatbuffer, tflite_operator->mutating_variable_inputs()); - intermediates = FlatbufferUtils::CopyVector(_flatbuffer, tflite_operator->intermediates()); } auto serialised_inputs = _flatbuffer.CreateVector(inputs); + auto serialised_intermediates = _flatbuffer.CreateVector(intermediates); auto serialised_outputs = _flatbuffer.CreateVector(outputs); auto serialised_options = SerialiseOptions(operation, type); auto serialised_options2 = SerialiseOptions2(operation, type); _serialised_operations.push_back(tflite::CreateOperator(_flatbuffer, opcode_index, serialised_inputs, serialised_outputs, - builtin_options_type, serialised_options, custom_options, custom_options_format, mvi, intermediates, + builtin_options_type, serialised_options, custom_options, custom_options_format, mvi, serialised_intermediates, large_custom_options_offset, large_custom_options_size, builtin_options_2_type, serialised_options2)); } -- GitLab
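For reference, below is a minimal floating-point sketch of the per-time-step computation that the lowering in compiler/lstm.cpp expresses as a graph of FullyConnected, Add, Sigmoid/Tanh, Mul, Clamp and MemoryCopy operations. It covers the supported variant only (no CIFG, peephole, projection or gate normalisation, matching ConstraintLSTM above); the helper names and the plain-float arithmetic are illustrative assumptions, whereas the generated graph operates on quantized tensors using the Q0.15/Q3.12 scalings set up in CalculateGate, CalculateCellState and CalculateOutputState.

#include <cmath>
#include <cstddef>
#include <vector>

// Weights for one LSTM cell, row-major: W* are [nCell x nFeature], R* are [nCell x nCell],
// b* are [nCell]. These correspond to TFLite inputs 1-8 (gate weights) and 12-15 (gate biases).
struct LstmWeights
{
    std::vector<float> Wi, Wf, Wc, Wo;  // input-to-{input,forget,cell,output} weights
    std::vector<float> Ri, Rf, Rc, Ro;  // recurrent-to-{input,forget,cell,output} weights
    std::vector<float> bi, bf, bc, bo;  // gate biases
};

static float Dot(const float *w, const float *v, std::size_t n)
{
    float acc = 0.0f;
    for ( std::size_t i = 0; i < n; i++ ) acc += w[i] * v[i];
    return acc;
}

// gate = Activation( FC(x, W) + FC(h, R) + b ), mirroring LSTM::CalculateGate where the two
// FullyConnected results are combined with an elementwise Add before Sigmoid or Tanh.
static float Gate(const std::vector<float> &x, const std::vector<float> &h, const std::vector<float> &W,
    const std::vector<float> &R, const std::vector<float> &b, std::size_t row, bool useTanh)
{
    float pre = Dot(&W[row * x.size()], x.data(), x.size()) + Dot(&R[row * h.size()], h.data(), h.size()) + b[row];
    return useTanh ? std::tanh(pre) : 1.0f / (1.0f + std::exp(-pre));
}

// One iteration of LSTM::Step for a single batch entry: h is the output state (TFLite input 18)
// and c the cell state (TFLite input 19); both persist between time steps, and h is also what
// SetOutputWrite copies into the current time/batch slice of the OFM.
void LstmStep(const std::vector<float> &x, std::vector<float> &h, std::vector<float> &c, const LstmWeights &w, float cellClip)
{
    std::vector<float> hNew(h.size()), cNew(c.size());
    for ( std::size_t j = 0; j < h.size(); j++ )
    {
        float i = Gate(x, h, w.Wi, w.Ri, w.bi, j, false);  // input gate   (Sigmoid)
        float f = Gate(x, h, w.Wf, w.Rf, w.bf, j, false);  // forget gate  (Sigmoid)
        float g = Gate(x, h, w.Wc, w.Rc, w.bc, j, true);   // cell gate    (Tanh)
        float o = Gate(x, h, w.Wo, w.Ro, w.bo, j, false);  // output gate  (Sigmoid)

        float cj = f * c[j] + i * g;                       // CalculateCellState
        if ( cellClip != 0.0f ) cj = std::fmin(std::fmax(cj, -cellClip), cellClip);
        cNew[j] = cj;
        hNew[j] = o * std::tanh(cj);                       // CalculateOutputState
    }
    c = cNew;
    h = hNew;
}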