diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.cpp b/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.cpp
index cfbcd53abf043040f504472a208195a054eb95ec..26eec5c1f2c129b42abe5ac514648df1f95b46e8 100644
--- a/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.cpp
+++ b/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.cpp
@@ -497,11 +497,14 @@ int EthosU55RCSGenerator::CalcCommandWaits(const MemoryAccesses &opAccesses, std
 
 // Returns LUT slot to be used for the given LUT operation.
 // Sets alreadyInLutMem to true if the LUT is already in SHRAM.
-int EthosU55RCSGenerator::AllocateLutSlot(
-    std::vector<LutSlot> &lutSlots, const HLCOperation *op, int sizeInSlots, int timestamp, bool &alreadyInLutMem)
+int EthosU55RCSGenerator::AllocateLutSlot(const MemArea &memArea, Address address, int lutSize, int timestamp, bool &alreadyInLutMem)
 {
     alreadyInLutMem = false;
-    int totalSlots = int(lutSlots.size());
+    int lutSlotSize = _arch->_shram.lutSlotSize;
+    assert(lutSize % lutSlotSize == 0);
+
+    int sizeInSlots = lutSize / lutSlotSize;
+    int totalSlots = int(_lutSlots.size());
     if ( sizeInSlots < 0 || sizeInSlots > totalSlots )
     {
         assert(false);
@@ -511,22 +514,25 @@ int EthosU55RCSGenerator::AllocateLutSlot(
     int allocatedSlot = 0;
     for ( int i = 0; i < totalSlots; i += sizeInSlots )
     {
-        if ( lutSlots[i].hlcOp == op )
+        if ( _lutSlots[i].memory == memArea.memory && _lutSlots[i].address == address && _lutSlots[i].sizeBytes == lutSize )
         {
             // LUT is already in SHRAM
             allocatedSlot = i;
             alreadyInLutMem = true;
             break;
         }
-        if ( lutSlots[i].lastUsed < lutSlots[allocatedSlot].lastUsed )
+        assert(allocatedSlot < totalSlots);
+        if ( _lutSlots[i].lastUsed < _lutSlots[allocatedSlot].lastUsed )
         {
             allocatedSlot = i;
         }
     }
     for ( int j = allocatedSlot; j < allocatedSlot + sizeInSlots; ++j )
     {
-        lutSlots[j].hlcOp = op;
-        lutSlots[j].lastUsed = timestamp;
+        _lutSlots[j].memory = memArea.memory;
+        _lutSlots[j].address = address;
+        _lutSlots[j].sizeBytes = lutSize;
+        _lutSlots[j].lastUsed = timestamp;
     }
     return allocatedSlot;
 }
@@ -1225,20 +1231,18 @@ void EthosU55RCSGenerator::InsertLUTDMACommand(
     assert(op->type == OpType::LUT || (!op->subOps.empty() && op->subOps[0].type == OpType::LUT));
-    const auto &srcTens = op->type == OpType::LUT ? op->parameters.lut : op->subOps[0].parameters.lut;
+    const auto &lutTens = op->type == OpType::LUT ? op->parameters.lut : op->subOps[0].parameters.lut;
     assert(config->_layout.lutStart > 0);
-    assert(srcTens.sizeBytes % lutSlotSize == 0);
     bool alreadyInLutMem;
-    int sizeInSlots = srcTens.sizeBytes / lutSlotSize;
-    int slot = AllocateLutSlot(_lutSlots, op.get(), sizeInSlots, index, alreadyInLutMem);
+    int slot = AllocateLutSlot(lutTens.memArea, lutTens.address, lutTens.sizeBytes, index, alreadyInLutMem);
     _stripeToLutSlot[stripe] = slot;
 
     if ( !alreadyInLutMem )
     {
         auto dma = std::make_unique<HLCDMA>();
-        dma->srcMemArea = srcTens.memArea;
-        dma->srcAddress = srcTens.address;
-        dma->length = srcTens.sizeBytes;
+        dma->srcMemArea = lutTens.memArea;
+        dma->srcAddress = lutTens.address;
+        dma->length = lutTens.sizeBytes;
         dma->destMemArea = _arch->LUTMemory();
         dma->destAddress = _arch->_shram.bankSizeBytes * config->_layout.lutStart + slot * lutSlotSize;
         emitted.push_back(dma.get());
@@ -1940,8 +1944,7 @@ void EthosU55RCSGenerator::PrepareCommand(int index, HighLevelCommand *cmd, Temp
         // LUT is overwritten by SHRAM accumulator buffers; clear slots
         for ( auto &slot : _lutSlots )
         {
-            slot.hlcOp = nullptr;
-            slot.lastUsed = 0;
+            slot = {};
         }
     }
 }
@@ -1959,8 +1962,7 @@ std::vector EthosU55RCSGenerator::GenerateCommandStream(
     // Clear lut slots at start of command stream generation
     for ( auto &slot : _lutSlots )
     {
-        slot.hlcOp = nullptr;
-        slot.lastUsed = 0;
+        slot = {};
     }
 
     GenerateInitialRegisterSetup();
@@ -2011,7 +2013,7 @@ std::vector EthosU55RCSGenerator::GenerateCommandStream(
         // Return command mapping information to the caller
         if ( cmdRanges && cmd->IsStripe() )
         {
-            cmdRanges->emplace_back(static_cast<HLCStripe *>(cmd.get())->operation->_srcId, emitStart, _emit.Position());
+            cmdRanges->emplace_back(static_cast<HLCStripe *>(cmd.get())->operation->srcId, emitStart, _emit.Position());
         }
         cmdIndex++;
     }
diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.hpp b/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.hpp
index 233b90ba316c89e9c84ab89f0cc5afbfb1c350ee..662ce49a148eed65e8ce8c17d45e9651f8b2028c 100644
--- a/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.hpp
+++ b/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.hpp
@@ -111,7 +111,9 @@ using MemoryAccesses = std::vector;
 
 struct LutSlot
 {
-    const HLCOperation *hlcOp = nullptr;
+    const ArchitectureMemory *memory = nullptr;
+    Address address = -1;
+    int sizeBytes = -1;
     int lastUsed = 0;
 };
 
@@ -164,7 +166,7 @@ static int CalcCommandWaits(const MemoryAccesses &opAccesses, std::deque &outstanding);
 
     // Returns LUT slot to be used for the given LUT operation.
    // Sets alreadyInLutMem to true if the LUT is already in SHRAM.
-    int AllocateLutSlot(std::vector<LutSlot> &lutSlots, const HLCOperation *op, int sizeInSlots, int timestamp, bool &alreadyInLutMem);
+    int AllocateLutSlot(const MemArea &memArea, Address address, int lutSize, int timestamp, bool &alreadyInLutMem);
     //----------------------------------------------------------------------
     // Scaling (OFM/OPA/OPB_SCALE)
     //----------------------------------------------------------------------
diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85.cpp b/ethosu/regor/architecture/ethosu85/ethos_u85.cpp
index bf13380a03b1292fd53c63d70053a7b51e69de45..c070b5297c65e483a4eb2d16e06b097765ed85f9 100644
--- a/ethosu/regor/architecture/ethosu85/ethos_u85.cpp
+++ b/ethosu/regor/architecture/ethosu85/ethos_u85.cpp
@@ -1421,19 +1421,6 @@ bool EthosU85OpGroup::Fuse(const ArchitectureOpGroupQuery &op, const std::vector
         return false;
     }
 
-    if ( _chainLength > 1 )
-    {
-        // TODO MLBEDSW-9142: support fusing on chained ops
-        return false;
-    }
-
-    // activation fusing..
-    if ( op.ifm[0].type == DataType::Int16 && (op.type == OpType::Sigmoid || op.type == OpType::Tanh) )
-    {
-        // Can not fuse int16 Sigmoid and Tanh LUT since they require special scaling done by AvgPoolNop
-        return false;
-    }
-
     if ( dependsOn.size() > 1 )
     {
         // Can only fuse with one op
@@ -1454,8 +1441,8 @@ bool EthosU85OpGroup::Fuse(const ArchitectureOpGroupQuery &op, const std::vector
     }
     const EthosU85OpGroup::OpInfo &prevOp = _ops[dep];
 
-    // Can't fuse activation with activation
-    if ( IsActivation(op.type) && _hasFusedActivation )
+    // Can't fuse two consecutive activations
+    if ( IsActivation(op.type) && IsActivation(prevOp.type) )
     {
         return false;
     }
@@ -1489,7 +1476,6 @@ bool EthosU85OpGroup::Fuse(const ArchitectureOpGroupQuery &op, const std::vector
         return false;
     }
 
-    _hasFusedActivation = _hasFusedActivation || IsActivation(op.type);
     _hasFusedTranspose = _hasFusedTranspose || (op.type == OpType::Transpose && !IsNone(op.ofm.transpose));
     _hasFusedReverse = _hasFusedReverse || (op.type == OpType::Reverse && op.ofm.reverse != ReverseType::None);
 
@@ -1512,11 +1498,6 @@ bool EthosU85OpGroup::Chain(const ArchitectureOpGroupQuery &op, const std::vecto
         // can only consider external (non-constant) inputs for chaining
         return false;
     }
-    if ( _opsCount > _chainLength )
-    {
-        // TODO MLBEDSW-9142: support chaining on fused ops
-        return false;
-    }
     if ( npuOp != EthosU85NpuOp::Elementwise )
     {
         return false;
     }
@@ -1583,7 +1564,6 @@ int EthosU85OpGroup::Add(const ArchitectureOpGroupQuery &op, const std::vector
diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.cpp b/ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.cpp
--- a/ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.cpp
+++ b/ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.cpp
-    std::vector<LutSlot> &lutSlots, const HLCOperation *op, int sizeInSlots, int timestamp, bool &alreadyInLutMem)
+int EthosU85RCSGenerator::AllocateLutSlot(std::vector<LutSlot> &lutSlots, const MemArea &memArea, Address address,
+    int lutSize, int timestamp, bool &alreadyInLutMem)
 {
     alreadyInLutMem = false;
+    int lutSlotSize = ArchEthosU85::LUT_SLOT_SIZE;
+    assert(lutSize % lutSlotSize == 0);
+
+    int sizeInSlots = lutSize / lutSlotSize;
     int totalSlots = int(lutSlots.size());
     if ( sizeInSlots < 0 || sizeInSlots > totalSlots )
     {
@@ -671,13 +675,14 @@ int EthosU85RCSGenerator::AllocateLutSlot(
     int allocatedSlot = 0;
     for ( int i = 0; i < totalSlots; i += sizeInSlots )
     {
-        if ( lutSlots[i].hlcOp == op )
+        if ( lutSlots[i].memory == memArea.memory && lutSlots[i].address == address && lutSlots[i].sizeBytes == lutSize )
         {
             // LUT is already in SHRAM
             allocatedSlot = i;
             alreadyInLutMem = true;
             break;
         }
+        assert(allocatedSlot < static_cast<int>(lutSlots.size()));
         if ( lutSlots[i].lastUsed < lutSlots[allocatedSlot].lastUsed )
         {
             allocatedSlot = i;
@@ -685,7 +690,9 @@ int EthosU85RCSGenerator::AllocateLutSlot(
     }
     for ( int j = allocatedSlot; j < allocatedSlot + sizeInSlots; ++j )
     {
-        lutSlots[j].hlcOp = op;
+        lutSlots[j].memory = memArea.memory;
+        lutSlots[j].address = address;
+        lutSlots[j].sizeBytes = lutSize;
         lutSlots[j].lastUsed = timestamp;
     }
     return allocatedSlot;
@@ -949,8 +956,7 @@ void EthosU85RCSGenerator::GeneratePadding(const HLCPadding &padding)
 void EthosU85RCSGenerator::GenerateActivation(const HLCStripe *stripe, MemoryAccesses &memoryAccesses)
 {
     const HLCOperation *op = stripe->operation.get();
-    OpType opType = OpType::None;
-    const HLCParameters *parameters = nullptr;
+    const HLCSubOperation *activationOp = nullptr;
     assert(stripe->opGroup != nullptr);
     EthosU85OpGroup *opGroup = static_cast<EthosU85OpGroup *>(stripe->opGroup);
     auto &ofm = op->ofm;
@@ -958,23 +964,18 @@ void EthosU85RCSGenerator::GenerateActivation(const HLCStripe *stripe, MemoryAcc
 
     if ( IsActivation(op->type) )
     {
         // Non-fused activation
-        opType = op->type;
-        parameters = &op->parameters;
+        activationOp = op;
     }
-    else
+    else if ( op->subOps.size() > 0 )
     {
-        for ( auto &subOp : op->subOps )
+        // Check if the first subOp is a fused activation.
+        auto &subOp = op->subOps[0];
+        if ( opGroup->IsFused(subOp.ifm[0].uid) && IsActivation(subOp.type) )
         {
-            if ( opGroup->IsFused(subOp.ifm[0].uid) && IsActivation(subOp.type) )
-            {
-                // Fused activation
-                opType = subOp.type;
-                parameters = &subOp.parameters;
-                // Use subOp ifm datatype to calculate clip range
-                clipDataType = subOp.ifm[0].dataType;
-                // We know there can be only one fused activation
-                break;
-            }
+            // Fused activation
+            activationOp = &subOp;
+            // Use subOp ifm datatype to calculate clip range
+            clipDataType = subOp.ifm[0].dataType;
         }
     }
@@ -997,13 +998,13 @@ void EthosU85RCSGenerator::GenerateActivation(const HLCStripe *stripe, MemoryAcc
 
     auto act = activation_function::LUT_NONE;
     uint32_t tableIndex = 0;
-    if ( IsLUTType(opType) )
+    if ( activationOp && IsLUTType(activationOp->type) )
     {
-        auto &lutParams = parameters->lut;
+        auto opType = activationOp->type;
+        auto &lutParams = activationOp->parameters.lut;
         int lutSize = lutParams.sizeBytes;
-
-        auto pos = _stripeToLutSlot.find(stripe);
-        if ( pos != _stripeToLutSlot.end() )
+        auto pos = _opToLutSlot.find(activationOp->srcId);
+        if ( pos != _opToLutSlot.end() )
        {
            tableIndex = pos->second;
        }
@@ -1569,14 +1570,33 @@ void EthosU85RCSGenerator::UpdateMemoryAccesses(const MemoryAccesses &memoryAcce
     }
 }
 
+std::unique_ptr<HLCDMA> EthosU85RCSGenerator::CreateLUTDMA(const HLCSubOperation *op, std::vector<LutSlot> &lutSlots, int timestamp)
+{
+    const auto &lutTens = op->parameters.lut;
+    bool alreadyInLutMem;
+    int slot = AllocateLutSlot(lutSlots, lutTens.memArea, lutTens.address, lutTens.sizeBytes, timestamp, alreadyInLutMem);
+    _opToLutSlot[op->srcId] = slot;
+
+    if ( !alreadyInLutMem )
+    {
+        auto dma = std::make_unique<HLCDMA>();
+        dma->srcMemArea = lutTens.memArea;
+        dma->srcAddress = lutTens.address;
+        dma->length = lutTens.sizeBytes;
+        dma->destMemArea = _arch->LUTMemory();
+        dma->destAddress = slot * ArchEthosU85::LUT_SLOT_SIZE;
+        return dma;
+    }
+    return nullptr;
+}
+
 // Inserts DMA commands for copying LUTs from constant memory
 // to LUT memory
 std::vector<std::unique_ptr<HighLevelCommand>>
 EthosU85RCSGenerator::InsertLUTDMACommands(std::vector<std::unique_ptr<HighLevelCommand>> &cmds)
 {
     std::vector<std::unique_ptr<HighLevelCommand>> result;
-    int lutSlotSize = ArchEthosU85::LUT_SLOT_SIZE;
-    int slots = int(_arch->_lutRam->SizeBytes() / lutSlotSize);
+    int slots = int(_arch->_lutRam->SizeBytes() / ArchEthosU85::LUT_SLOT_SIZE);
     std::vector<LutSlot> lutSlots(slots);
     int timestamp = 0;
     result.reserve(cmds.size());
@@ -1587,30 +1607,28 @@ EthosU85RCSGenerator::InsertLUTDMACommands(std::vector
             auto stripe = static_cast<HLCStripe *>(hlc.get());
             auto op = stripe->operation;
-            // TODO MLBEDSW-9142 LUT for chained subOps should be inserted before the primary Op
-            const auto &subOps = stripe->operation->subOps;
-            auto lutSubOp = std::find_if(
-                subOps.begin(), subOps.end(), [](const auto &subOp) { return IsLUTType(subOp.type); });
-            if ( IsLUTType(op->type) || (lutSubOp != subOps.end()) )
+
+            if ( IsLUTType(op->type) )
             {
-                const auto &srcTens = IsLUTType(op->type) ? op->parameters.lut : lutSubOp->parameters.lut;
-                assert(srcTens.sizeBytes % lutSlotSize == 0);
-                bool alreadyInLutMem;
-                int sizeInSlots = srcTens.sizeBytes / lutSlotSize;
-                int slot = AllocateLutSlot(lutSlots, op.get(), sizeInSlots, timestamp, alreadyInLutMem);
-                _stripeToLutSlot[stripe] = slot;
-
-                if ( !alreadyInLutMem )
+                // Create and insert LUT DMA for a primary op activation
+                if ( auto dma = CreateLUTDMA(op.get(), lutSlots, timestamp) )
                 {
-                    auto dma = std::make_unique<HLCDMA>();
-                    dma->srcMemArea = srcTens.memArea;
-                    dma->srcAddress = srcTens.address;
-                    dma->length = srcTens.sizeBytes;
-                    dma->destMemArea = _arch->LUTMemory();
-                    dma->destAddress = slot * lutSlotSize;
                     result.push_back(std::move(dma));
                 }
             }
+
+            // Create and insert LUT DMAs for any fused activations in the opgroup
+            const auto &subOps = stripe->operation->subOps;
+            for ( auto subOp = subOps.begin(); subOp != subOps.end(); subOp++ )
+            {
+                if ( IsLUTType(subOp->type) )
+                {
+                    if ( auto dma = CreateLUTDMA(&(*subOp), lutSlots, timestamp) )
+                    {
+                        result.push_back(std::move(dma));
+                    }
+                }
+            }
         }
         result.push_back(std::move(hlc));
     }
@@ -2007,7 +2025,7 @@ std::shared_ptr EthosU85RCSGenerator::MakeStripeForSubOp(HLCStripe *s
     op->type = subOp.type;
     op->ifm = subOp.ifm;
     op->ofm = subOp.ofm;
-    op->_srcId = subOp._srcId;
+    op->srcId = subOp.srcId;
     if ( IsLUTType(subOp.type) )
     {
         op->parameters.lut = subOp.parameters.lut;
     }
@@ -2099,7 +2117,7 @@ bool EthosU85RCSGenerator::GenerateOpGroup(HLCStripe *stripe, HLCStripe *prevOp,
     // Return command mapping information to the caller
     if ( cmdRanges )
     {
-        cmdRanges->emplace_back(stripe->operation->_srcId, emitStart, _emit.Position());
+        cmdRanges->emplace_back(stripe->operation->srcId, emitStart, _emit.Position());
     }
 
     if ( isChained )
@@ -2214,7 +2232,7 @@ std::vector EthosU85RCSGenerator::GenerateCommandStream(
     std::vector<std::unique_ptr<HighLevelCommand>> &highLevelCommandStream, CmdRanges *cmdRanges, bool verbose)
 {
     _emit.Clear();
-    _stripeToLutSlot.clear();
+    _opToLutSlot.clear();
     GenerateInitialRegisterSetup();
     auto cmds = InsertLUTDMACommands(highLevelCommandStream);
     cmds = InsertTileDMACommands(cmds);
diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.hpp b/ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.hpp
index d37906246c9ec5f09e348767a60cbf7896690a99..a1f7ced2c11802db3d3f2eaaec4e67757165ebd1 100644
--- a/ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.hpp
+++ b/ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.hpp
@@ -100,7 +100,9 @@ using MemoryAccesses = std::vector;
 
 struct LutSlot
 {
-    const HLCOperation *hlcOp = nullptr;
+    const ArchitectureMemory *memory = nullptr;
+    Address address = -1;
+    int sizeBytes = -1;
     int lastUsed = 0;
 };
 
@@ -148,7 +150,8 @@ static int CalcCommandWaits(const MemoryAccesses &opAccesses, std::deque &outstanding);
 
     // Returns LUT slot to be used for the given LUT operation.
    // Sets alreadyInLutMem to true if the LUT is already in SHRAM.
-    int AllocateLutSlot(std::vector<LutSlot> &lutSlots, const HLCOperation *op, int sizeInSlots, int timestamp, bool &alreadyInLutMem);
+    int AllocateLutSlot(std::vector<LutSlot> &lutSlots, const MemArea &memArea, Address address, int lutSize,
+        int timestamp, bool &alreadyInLutMem);
     //----------------------------------------------------------------------
     // Scaling (OFM/IFM/IFM2_SCALE)
     //----------------------------------------------------------------------
@@ -209,6 +212,8 @@ protected:
     void GenerateWaits(bool isKernelWait, const MemoryAccesses &memoryAccesses, std::deque<MemoryAccesses> &outstandingAccesses);
     // Save current memory accesses to accessesToUpdate
     void UpdateMemoryAccesses(const MemoryAccesses &memoryAccesses, std::deque<MemoryAccesses> &accessesToUpdate, int maxWaits);
+    // Create the LUT DMA command required for the given HLCSubOperation
+    std::unique_ptr<HLCDMA> CreateLUTDMA(const HLCSubOperation *op, std::vector<LutSlot> &lutSlots, int timestamp);
     // Inserts DMA commands for copying LUTs from constant memory
     // to LUT memory
     std::vector<std::unique_ptr<HighLevelCommand>> InsertLUTDMACommands(std::vector<std::unique_ptr<HighLevelCommand>> &cmds);
@@ -251,8 +256,8 @@ public:
 
 private:
     ArchEthosU85 *_arch;
-    // For stripes that use LUT: the LUT slot to be used
-    std::unordered_map<const HLCStripe *, int> _stripeToLutSlot;
+    // For operations that use LUT: the LUT slot to be used
+    std::unordered_map<UniqueId, int> _opToLutSlot;
     EthosU85Emitter _emit;
 };
diff --git a/ethosu/regor/compiler/high_level_command_stream.hpp b/ethosu/regor/compiler/high_level_command_stream.hpp
index 463aea5666adc47d97be4fbdae9177190d57b4c0..7a4ba72379320cc297e022d633748fff2b0249ed 100644
--- a/ethosu/regor/compiler/high_level_command_stream.hpp
+++ b/ethosu/regor/compiler/high_level_command_stream.hpp
@@ -159,7 +159,7 @@ struct HLCSubOperation
     std::vector<HLCFeatureMap> ifm;
     HLCFeatureMap ofm;
     HLCParameters parameters = {};
-    UniqueId _srcId = 0;
+    UniqueId srcId = 0;
 };
 
 ///
diff --git a/ethosu/regor/compiler/high_level_command_stream_generator.cpp b/ethosu/regor/compiler/high_level_command_stream_generator.cpp
index b661fe34f4dbf3b71a563f99dcbea711d788ca0e..0141d6a4b5a9d61bd74da73aaae9f183693d857d 100644
--- a/ethosu/regor/compiler/high_level_command_stream_generator.cpp
+++ b/ethosu/regor/compiler/high_level_command_stream_generator.cpp
@@ -311,7 +311,7 @@ static HLCSubOperation MakeSubOperation(const std::unique_ptr
     OFM(), hlcSubOp.ofm);
-    hlcSubOp._srcId = schedOp->Uid();
+    hlcSubOp.srcId = schedOp->Uid();
 
     if ( schedOp->Type() == OpType::LeakyRelu )
     {
@@ -337,7 +337,7 @@ static std::shared_ptr MakeOperation(SchedulerOperation *schedOp,
     op->type = schedOp->Type();
     op->kernel = *schedOp->Kernel();
     op->config = opInfo->Config();
-    op->_srcId = schedOp->Uid();
+    op->srcId = schedOp->Uid();
     size_t ifms = 0;
     for ( const auto &input : schedOp->inputs.pairs() )
     {
@@ -783,7 +783,7 @@ void HLCStreamGenerator::GenerateCommandsForCascade(vector_span
     second.shape;
     hlcOps[i - 1]->ofm.shape = shape;
-    // TODO MLBEDSW-9142: support fused activations inside chains
+    // TODO MLBEDSW-9143: support cascading of chains
     // for now, we assume maximum one subOp (fused activation) on cascades
     if ( hlcOps[i - 1]->subOps.size() )
     {
diff --git a/ethosu/regor/compiler/live_range.cpp b/ethosu/regor/compiler/live_range.cpp
index 856973c610f387286eb3a2db18a833ce06b0caaa..8b2bf156e3c66c6a4dcbcee6b95ecca946c4c24d 100644
--- a/ethosu/regor/compiler/live_range.cpp
+++ b/ethosu/regor/compiler/live_range.cpp
@@ -1,5 +1,5 @@
 //
-// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+// SPDX-FileCopyrightText: Copyright 2021-2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
 //
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -76,14 +76,17 @@ void LiveRangeGraph::ExtractLiveRangesFromCascades(const std::vector
         OpGroup();
         assert(opGroup != nullptr);
-        if ( opGroup->NeedsAllocation(schedOp->OFM()->tensor->uid) )
+
+        // Get the ofm of the last operator in the group
+        auto opGroupOfm = schedOp->SubOps().size() ? schedOp->SubOps().back()->OFM() : schedOp->OFM();
+        if ( opGroup->NeedsAllocation(opGroupOfm->tensor->uid) )
         {
             // Check if op have an ifm tensor that can be reused for the ofm
-            auto ifmTens = ReusableIFM(schedOp, targetMemory);
+            auto ifmTens = ReusableIFM(schedOp, opGroupOfm, targetMemory);
             if ( ifmTens != nullptr )
             {
                 // ifm can be reused
-                FuseRanges(ifmTens, schedOp->OFM()->tensor.get());
+                FuseRanges(ifmTens, opGroupOfm->tensor.get());
             }
         }
     }
@@ -220,13 +223,16 @@ LiveRange *LiveRangeGraph::FuseRanges(SchedulerTensor *inTens, SchedulerTensor *
     return lr;
 }
 
-SchedulerTensor *LiveRangeGraph::ReusableIFM(const std::unique_ptr<SchedulerOperation> &schedOp, const MemArea &targetMemory)
+// Check if any of the IFMs consumed by the first operator in an opgroup can be reused for the OFM
+// tensor of the last operator in the opgroup.
+// Requires the first operator to be an elementwise operator and is also applicable to stand-alone
+// elementwise operators (which are just opgroups of length 1).
+SchedulerTensor *LiveRangeGraph::ReusableIFM(
+    const std::unique_ptr<SchedulerOperation> &schedOp, const SchedulerConnection *ofmConn, const MemArea &targetMemory)
 {
     SchedulerTensor *reusableIfm = nullptr;
     if ( IsElementwise(schedOp->Type()) )
     {
-        // Check if possible to merge ifm/ofm live ranges of elementwise op
-        const auto ofmConn = schedOp->OFM();
         const auto ofmTens = ofmConn->tensor.get();
 
         if ( !ShouldBeIgnored(ofmTens, targetMemory) )
diff --git a/ethosu/regor/compiler/live_range.hpp b/ethosu/regor/compiler/live_range.hpp
index 51048fd1d6154f9292c0fda83297e5334a76eb01..c7b57fca597971353fa4065f729d51ea36a0f9ee 100644
--- a/ethosu/regor/compiler/live_range.hpp
+++ b/ethosu/regor/compiler/live_range.hpp
@@ -1,5 +1,5 @@
 //
-// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+// SPDX-FileCopyrightText: Copyright 2021-2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
 //
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -105,7 +105,8 @@ public:
         Schedule *schedule, const MemArea &targetMemory, bool addRollingBuffers);
     LiveRange *GetOrCreateRange(SchedulerTensor *tens);
     LiveRange *FuseRanges(SchedulerTensor *inTens, SchedulerTensor *outTens);
-    SchedulerTensor *ReusableIFM(const std::unique_ptr<SchedulerOperation> &schedOp, const MemArea &targetMemory);
+    SchedulerTensor *ReusableIFM(const std::unique_ptr<SchedulerOperation> &schedOp, const SchedulerConnection *ofmConn,
+        const MemArea &targetMemory);
     virtual bool ShouldBeIgnored(SchedulerTensor *tens, const MemArea &targetMemory);
 };
 
diff --git a/ethosu/regor/compiler/scheduler_packing.cpp b/ethosu/regor/compiler/scheduler_packing.cpp
index 956daf6c013b5a92c692e74e818ea06e54348e70..2810c792db7da47f806f43a56358429849ed7a23 100644
--- a/ethosu/regor/compiler/scheduler_packing.cpp
+++ b/ethosu/regor/compiler/scheduler_packing.cpp
@@ -281,10 +281,10 @@ void SchedulerPacking::SchedulerPacking::PackOperations()
             LOG_TRACE1("Added {} (key {}) to {} (key {})\n", OpTypeToString(nextOp->Type()), key, OpTypeToString(prevOp->Type()), prevOpKey);
 
-            // Replace primary op's OFM by nextOp's OFM
+            // Replace previous op's OFM by nextOp's OFM
             if ( IsActivation(nextOp->Type()) )
             {
-                auto *ofmConn = primaryOp->OFM();
+                auto *ofmConn = prevOp->OFM();
                 ofmConn->tensor = nextOp->OFM()->tensor;
                 ofmConn->SetType(nextOp->OFM()->Type());
                 ofmConn->quantization.quantMin = nextOp->Output(TensorUsage::OFM)->quantization.quantMin;
@@ -450,7 +450,7 @@ int SchedulerPacking::CanPack(const SchedulerOperation *schedOp, const Scheduler
         return 0;
     }
 
-    if ( schedOp->OFM()->tensor->isGraphOutput )
+    if ( schedOp->OFM()->tensor->isGraphOutput || prevOp->OFM()->tensor->isGraphOutput )
     {
         return 0;
     }