From 824e8cf4ff9da5d1305b14462118d799ffe16d86 Mon Sep 17 00:00:00 2001 From: Philip Hall Date: Thu, 5 Dec 2024 17:59:04 +0000 Subject: [PATCH] MLBEDSW-10102: Implement Ethos-U55 transposes This commit implements 2-axis transposes for up to rank 3 shapes using AveragePool for both Ethos-U55 and Ethos-U65. - Updated shape stride generator to allow asymmetric rank granules. - Allow null opgroups for substituted stripes in the Ethos-U55 RCS generator. - Implementation also falls back to a memory copy for non-transposes that may reach the RCS. Signed-off-by: Philip Hall Change-Id: I09082e71c931a7a9f433ddb1bdc4ed1128e2e95b --- .../ethosu55/ethos_u55_constraints.cpp | 8 + .../ethos_u55_register_cs_generator.cpp | 189 +++++++++++++++++- .../ethos_u55_register_cs_generator.hpp | 3 +- ethosu/regor/common/numeric_util.hpp | 12 ++ ethosu/regor/common/shape.hpp | 26 ++- ethosu/regor/compiler/kernel.hpp | 2 + ethosu/regor/compiler/scheduler.cpp | 34 +++- ethosu/regor/compiler/scheduler_decompose.cpp | 5 +- ethosu/regor/compiler/scheduler_operation.hpp | 9 +- 9 files changed, 264 insertions(+), 24 deletions(-) diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_constraints.cpp b/ethosu/regor/architecture/ethosu55/ethos_u55_constraints.cpp index 62caffba..49c66485 100644 --- a/ethosu/regor/architecture/ethosu55/ethos_u55_constraints.cpp +++ b/ethosu/regor/architecture/ethosu55/ethos_u55_constraints.cpp @@ -37,6 +37,14 @@ bool EthosU55Constraints::SupportsMatMul(OpType opType) TransposeSupport EthosU55Constraints::SupportsTranspose(OpType opType, TransposeType transposeType) { if ( IsNone(transposeType) ) return TransposeSupport::Any; + + if ( opType == OpType::Transpose ) + { + if ( transposeType == TransposeType::NWHC || transposeType == TransposeType::NHCW || transposeType == TransposeType::NCWH ) + { + return TransposeSupport::NHWC; + } + } return TransposeSupport::None; } diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.cpp 
b/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.cpp index b8a662e3..e7a378ee 100644 --- a/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.cpp +++ b/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.cpp @@ -368,10 +368,7 @@ TileBox EthosU55RCSGenerator::GetTiles(const HLCFeatureMap &fm, const Shape &str } if ( fm.format == TensorFormat::NHCWB16 ) { - for ( int i = 0; i < 4; ++i ) - { - assert(tiles.address[i] % 16 == 0 && "NHCWB16 base address is not 16-byte aligned"); - } + assert((tiles.address[0] | tiles.address[1] | tiles.address[2] | tiles.address[3]) % 16 == 0 && "NHCWB16 base address is not 16-byte aligned"); } return tiles; } @@ -757,8 +754,6 @@ void EthosU55RCSGenerator::GenerateActivation(const HLCStripe *stripe, MemoryAcc assert(op->subOps.size() <= 1); OpType opType = OpType::None; const HLCParameters *parameters = nullptr; - assert(stripe->opGroup != nullptr); - EthosU55OpGroup *opGroup = static_cast(stripe->opGroup); if ( IsActivation(op->type) ) { // Non-fused activation @@ -766,7 +761,8 @@ void EthosU55RCSGenerator::GenerateActivation(const HLCStripe *stripe, MemoryAcc parameters = &op->parameters; assert(op->subOps.empty() || opType == op->subOps[0].type); } - else if ( !op->subOps.empty() && !opGroup->NeedsAllocation(op->subOps[0].ifm[0].uid) ) + else if ( !op->subOps.empty() && + (stripe->opGroup && !static_cast(stripe->opGroup)->NeedsAllocation(op->subOps[0].ifm[0].uid)) ) { // Fused activation assert(IsActivation(op->subOps[0].type)); @@ -1242,6 +1238,180 @@ void EthosU55RCSGenerator::InsertTileDMACommand(const HLCStripe *stripe, Tempora } } +static inline int FirstSwapped(unsigned transpose, int &from) +{ + unsigned mask = unsigned(transpose) ^ unsigned(TransposeType::None); + for ( int i = 0; i < 8; i++ ) + { + if ( mask & 0xF ) + { + from = (transpose >> (i * 4)) & 0xF; + return i; + } + mask = mask >> 4; + } + from = 0; + return -1; +} + +void 
EthosU55RCSGenerator::InsertTransposeCommand(const HLCStripe *stripe, Temporaries &temps, std::vector &emitted) +{ + auto op = stripe->operation; + auto &ifm = op->ifm[0]; + auto &ofm = op->ofm; + + assert(op->subOps.empty()); + assert(ifm.format == TensorFormat::NHWC); + assert(ofm.format == TensorFormat::NHWC); + assert(ifm.shape.Size() <= 4); + ifm.shape = Shape::PadAxes(ifm.shape, 4, 0); + Shape outShape = ifm.shape.Permute(unsigned(ofm.transpose)); + + // Which indexed axes have been swapped + unsigned swapMask = unsigned(ofm.transpose) ^ unsigned(TransposeType::None); + unsigned validMask = ofm.shape.ShapeMask(); + bool identity = (swapMask == 0) || (outShape.EqualMask(ofm.shape.WithOnes()) == validMask); + if ( identity ) + { + LOG_WARN("RCS: Emitting no-op transpose as a memory copy\n"); + auto dma = std::make_unique(); + dma->srcMemArea = ifm.memArea; + dma->srcAddress = ifm.address; + dma->length = DataTypeStorageSizeBytes(ifm.dataType, ifm.shape.Elements()); + dma->destMemArea = ofm.memArea; + dma->destAddress = ofm.address; + emitted.push_back(dma.get()); + temps.cmds.push_back(std::move(dma)); + } + else + { + // Strided output on AveragePool can swap Height/Width over any channel depth by + // adjusting the output strides to place the channel arrays in the required layout. + // + // IFM [h_pos, w_pos] = h_pos * ifm_stride_h + w_pos * ifm_stride_w + // OFM [h_pos, w_pos] = h_pos * ofm_stride_w + w_pos * ofm_stride_h (stride has been swapped) + // + // Example: + // Shape (2,3) transposed to Shape (3,2) + // |0|1|2| ifm_stride_w = 1 |0|3| ofm_stride_w = 1 + // |4|5|6| ifm_stride_h = 3 |1|4| ofm_stride_h = 2 + // |2|5| + // + // This can be used to implement any 2-axis channel swap for 3 axes or fewer. + // NWHC - Transpose volume in a single pass. + // NHCW - Transpose 'height' number of CW1 slices. + // NCWH - Transpose 'width' number of HW1 slices (requires extra IFM striding). 
+ HLCFeatureMap inFM = ifm; + HLCFeatureMap outFM = ofm; + + // Only two axis swaps can be achieved using AvgPool + if ( NonZeroNybbles(swapMask) == 2 ) + { + int elementSize = DataTypeSizeBits(ofm.dataType) / 8; + + // Can only swap 2 axes at once using this method + int from; + int to = FirstSwapped(unsigned(ofm.transpose), from); + from = ifm.shape.Size() - 1 - from; + to = ifm.shape.Size() - 1 - to; + + // Place the swappable axes in H/W (works in elements here) + outFM.shape = Shape(1, ifm.shape[from], ifm.shape[to], 1); + int depth = 1, slices = 1; + int ifmStep = 0; + int ofmStep = 0; + + // Not all elements participate in the transposed axes + if ( outFM.shape.Elements() != ifm.shape.Elements() ) + { + if ( ofm.transpose == TransposeType::NWHC ) + { + depth = ifm.shape.Depth(); + slices = 1; + ifmStep = ofmStep = 0; + assert(from == 1 && to == 2); + } + else if ( ofm.transpose == TransposeType::NHCW ) + { + depth = 1; + slices = ifm.shape.Height(); + ifmStep = ofmStep = ifm.shape.Depth() * ifm.shape.Width() * elementSize; + assert(from == 2 && to == 3); + } + else if ( ofm.transpose == TransposeType::NCWH ) + { + depth = 1; + slices = ifm.shape.Width(); + ifmStep = ifm.shape.Depth() * elementSize; + ofmStep = ifm.shape.Height() * elementSize; + assert(from == 1 && to == 3); + } + else + { + assert(false && "Unsupported transpose"); + } + } + + // Recalculate destination as same as source but with output different strides + outFM.shape = Shape(1, ifm.shape[from], ifm.shape[to], depth * elementSize); + inFM.shape = outFM.shape; + // Shapes are measured in terms of bytes, not elements. 
+ outFM.dataType = DataType::Int8; + inFM.dataType = DataType::Int8; + + // Special case for IFM with sparse strides + if ( (slices > 1) && (ofm.transpose == TransposeType::NCWH) ) + { + outFM.strides = Shape(1, elementSize, elementSize * ifm.shape.Width() * ifm.shape.Height(), elementSize); + inFM.strides = Shape(1, elementSize * ifm.shape.Width() * ifm.shape.Depth(), elementSize, elementSize); + } + else + { + outFM.strides = Shape(1, elementSize * depth, elementSize * depth * outFM.shape.Height(), elementSize); + inFM.strides = Shape::GetStridesForShape(inFM.shape, 1); + } + + // Repeat the transpose at advancing offsets for each slice + for ( int i = 0; i < slices; i++ ) + { + // Create new stripe operations + auto cmd = std::make_unique(*stripe); + cmd->operation = std::make_shared(); + cmd->operation->kernel = Kernel::UnitKernel(); + cmd->operation->type = OpType::AvgPool; + cmd->opGroup = nullptr; + cmd->operation->ifm.push_back(inFM); + cmd->operation->ofm = outFM; + cmd->ofmArea = outFM.shape; + cmd->ifmAreas[0] = inFM.shape; + + // Find a common block configuration + if ( i == 0 ) + { + ArchitectureConfigQuery query{}; + query.kernel = &cmd->operation->kernel; + query.ifmBits = DataTypeSizeBits(ifm.dataType); + query.ifmShape[0] = inFM.shape; + query.ofmShape = outFM.shape; + query.ofmFormat = TensorFormat::NHWC; + query.transpose = ofm.transpose; + temps.configs.push_back(_arch->FindBlockConfig(cmd->operation->type, query)); + } + cmd->operation->config = temps.configs.back().get(); + // Add to CMD list + emitted.push_back(cmd.get()); + temps.cmds.push_back(std::move(cmd)); + // Move to next slice + inFM.address += ifmStep; + outFM.address += ofmStep; + } + } + else + { + assert(false && "3-axis swaps must be decomposed"); + } + } +} //---------------------------------------------------------------------- @@ -1497,6 +1667,11 @@ void EthosU55RCSGenerator::PrepareCommand(int index, HighLevelCommand *cmd, Temp { InsertLUTDMACommand(index, stripe, temps, 
emitted); } + else if ( op->type == OpType::Transpose ) + { + InsertTransposeCommand(stripe, temps, emitted); + return; + } else if ( _arch->_shram.reservedEndBanks == 0 ) { // LUT is overwritten by SHRAM accumulator buffers; clear slots diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.hpp b/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.hpp index 26785525..8c01bff9 100644 --- a/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.hpp +++ b/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.hpp @@ -236,11 +236,12 @@ protected: void InsertLUTDMACommand(int index, const HLCStripe *stripe, Temporaries &temps, std::vector &emitted); // Inserts DMA commands to handle TILE operations virtual void InsertTileDMACommand(const HLCStripe *stripe, Temporaries &temps, std::vector &emitted); + // Inserts commands to handle transposing + virtual void InsertTransposeCommand(const HLCStripe *stripe, Temporaries &temps, std::vector &emitted); //---------------------------------------------------------------------- // Operations //---------------------------------------------------------------------- - struct AccessTracking { std::deque outstandingNpuAccesses; diff --git a/ethosu/regor/common/numeric_util.hpp b/ethosu/regor/common/numeric_util.hpp index d3406ae4..291ada24 100644 --- a/ethosu/regor/common/numeric_util.hpp +++ b/ethosu/regor/common/numeric_util.hpp @@ -376,6 +376,18 @@ constexpr bool IsPowerOfTwo(T x) return x > 0 && (x & (x - 1)) == 0; } +// Count the number of nonzero nybbles +inline unsigned NonZeroNybbles(unsigned mask) +{ + mask |= mask >> 2; // technically & with 0xCCCC and 0x2222 + mask |= mask >> 1; // but bits don't travel far enough to leak + mask &= 0x11111111; // =...A...A...A...A...A...A...A...A + mask += mask >> 16; // +...x...x...x...x...B...B...B...B + mask += mask >> 8; // +...x...x...x...x...x...x...C...C + mask += mask >> 4; // +...x...x...x...x...x...x...x...D + return
mask & 0xF; +}; + template OUT ClampToType(IN x) { diff --git a/ethosu/regor/common/shape.hpp b/ethosu/regor/common/shape.hpp index 3271786c..e6753274 100644 --- a/ethosu/regor/common/shape.hpp +++ b/ethosu/regor/common/shape.hpp @@ -264,11 +264,11 @@ public: // Compute a product of axes from start to end int AxisProduct(int start, int end) const { + if ( end > Size() ) end = Size(); if ( start == end ) return 0; - int tmp = 1; - assert(end <= Size()); int inner = ToOffset(end - 1); int outer = ToOffset(start); + int tmp = 1; for ( int i = inner; i <= outer; i++ ) { tmp *= At(i); @@ -509,6 +509,13 @@ public: unsigned EqualMask(const Shape &other) const { return MinAxisFunc>(*this, other); } + unsigned ShapeMask() const + { + unsigned shift = unsigned(_last); + shift = (shift < 32) ? (31u - shift) : 31u; + return ~0u >> shift; + } + bool IsValid() const { return _last >= 0; } bool IsDynamic() const { return _dynamic; } @@ -724,6 +731,7 @@ public: static Shape PadAxes(const Shape &shape, int axes, int padValue) { + if ( shape.Size() == axes ) return shape; return Shape(shape, std::max(axes, shape.Size()), padValue); } @@ -754,20 +762,26 @@ public: static Shape Wrap(const Shape &a, const Shape &b) { return Shape::MinFunc>(a, b); } + static Shape GetStridesForShape(const Shape &shape, int elementBytes) + { + return GetStridesForShape(shape, Shape(elementBytes)); + } + static Shape GetStridesForShape(const Shape &shape, const Shape &granularity) { - assert(granularity.Size() >= shape.Size()); Shape tmp(nullptr, shape.Size()); if ( shape.IsValid() ) { auto *gran = granularity.Storage(); auto *from = shape.Storage(); auto *result = tmp.Storage(); + int lastGranule = std::min(shape._last, granularity._last); result[0] = gran[0]; - for ( int i = 1; i <= shape._last; i++ ) - { + int i = 1; + for ( ; i <= lastGranule; i++ ) result[i] = ::RoundAway(result[i - 1] * from[i - 1], gran[i]); - } + for ( ; i <= shape._last; i++ ) + result[i] = result[i - 1] * from[i - 1]; } return 
tmp; } diff --git a/ethosu/regor/compiler/kernel.hpp b/ethosu/regor/compiler/kernel.hpp index 675b3764..a0b8f8f0 100644 --- a/ethosu/regor/compiler/kernel.hpp +++ b/ethosu/regor/compiler/kernel.hpp @@ -158,6 +158,8 @@ public: return fmt::format("size={},{} stride={},{}, dilation={},{} padding={}", _size.x, _size.y, _stride.x, _stride.y, _dilation.x, _dilation.y, _padding.ToString()); } + + static Kernel UnitKernel() { return Kernel({1, 1}, {1, 1}, {1, 1}); } }; static inline int RequiredInputSize(int value, int stride, int border, int upscale, int rounding = 0) diff --git a/ethosu/regor/compiler/scheduler.cpp b/ethosu/regor/compiler/scheduler.cpp index 595e7ee8..5523bd16 100644 --- a/ethosu/regor/compiler/scheduler.cpp +++ b/ethosu/regor/compiler/scheduler.cpp @@ -20,6 +20,7 @@ #include "common/logging.hpp" +#include "architecture/architecture_constraints.hpp" #include "architecture/weight_encoder.hpp" #include "cascade_builder.hpp" #include "common/data_type.hpp" @@ -193,7 +194,7 @@ static bool CheckLinearFormatForConcatSplit(SchedulerTensor *tensor) } -static int UpdateSchedulerTensor(TensorUsage usage, SchedulerConnection *conn) +static int UpdateSchedulerTensor(Architecture *arch, TensorUsage usage, SchedulerConnection *conn) { auto tensor = conn->tensor.get(); @@ -222,6 +223,15 @@ static int UpdateSchedulerTensor(TensorUsage usage, SchedulerConnection *conn) { tensor->needsLinearFormat = true; } + else if ( producer->Type() == OpType::Transpose ) + { + TransposeSupport supported = arch->Constraints()->SupportsTranspose(OpType::Transpose, producer->OFM()->transpose); + if ( supported == TransposeSupport::NHWC ) + { + tensor->needsLinearFormat = true; + } + } + if ( !producer->IsNpuOp() ) { tensor->hasCPUWriters = true; @@ -248,11 +258,20 @@ static int UpdateSchedulerTensor(TensorUsage usage, SchedulerConnection *conn) continue; } // Int32 ReduceSum requires linear format - if ( consumer->Type() == OpType::ReduceSum && tensor->dataType == DataType::Int32 ) + 
else if ( consumer->Type() == OpType::ReduceSum && tensor->dataType == DataType::Int32 ) { tensor->needsLinearFormat = true; continue; } + else if ( consumer->Type() == OpType::Transpose ) + { + TransposeSupport supported = arch->Constraints()->SupportsTranspose(OpType::Transpose, consumer->OFM()->transpose); + if ( supported == TransposeSupport::NHWC ) + { + tensor->needsLinearFormat = true; + } + } + // Check if consumer shape requires linear format // Brick format can only be used if both shapes have equal W and C // Need to check full shape on connection since tensor might have many producers (concat) @@ -295,23 +314,26 @@ Address Scheduler::CreateSchedulerRepresentation() for ( auto pos : schedOp->outputs.pairs() ) { - opMemoryRequired += UpdateSchedulerTensor(pos.first, &pos.second); + assert(!pos.second.tensor->producers.empty()); + opMemoryRequired += UpdateSchedulerTensor(_arch, pos.first, &pos.second); } for ( auto pos : schedOp->inputs.pairs() ) { - opMemoryRequired += UpdateSchedulerTensor(pos.first, &pos.second); + assert(!pos.second.tensor->consumers.empty()); + opMemoryRequired += UpdateSchedulerTensor(_arch, pos.first, &pos.second); } + for ( auto const &subOp : schedOp->SubOps() ) { for ( auto pos : subOp->outputs.pairs() ) { - opMemoryRequired += UpdateSchedulerTensor(pos.first, &pos.second); + opMemoryRequired += UpdateSchedulerTensor(_arch, pos.first, &pos.second); } for ( auto pos : subOp->inputs.pairs() ) { - opMemoryRequired += UpdateSchedulerTensor(pos.first, &pos.second); + opMemoryRequired += UpdateSchedulerTensor(_arch, pos.first, &pos.second); } } minMemoryRequired = std::max(minMemoryRequired, opMemoryRequired); diff --git a/ethosu/regor/compiler/scheduler_decompose.cpp b/ethosu/regor/compiler/scheduler_decompose.cpp index 9a6053d1..cd3adf05 100644 --- a/ethosu/regor/compiler/scheduler_decompose.cpp +++ b/ethosu/regor/compiler/scheduler_decompose.cpp @@ -220,6 +220,8 @@ bool CanRunOnHardware(Architecture *arch, const SchedulerOperation 
*schedOp) auto &ofmShape = schedOp->OFM()->SliceShape(); if ( ofmShape.Size() > 3 && ofmShape.Elements() > ofmShape.Width() * ofmShape.Height() * ofmShape.Depth() ) return false; + if ( arch->Constraints()->SupportsTranspose(schedOp->Type(), schedOp->OFM()->transpose) == TransposeSupport::None ) + return false; } auto *ifm = schedOp->TryIFM(0); auto *ifm2 = schedOp->TryIFM(1); @@ -1380,7 +1382,8 @@ std::vector> DecomposeTranspose(Architecture const auto axes = ifmShape.Size(); // We can handle all transpositions in a 3D shape - if ( axes < 4 || ifmShape.Elements() == ifmShape.Height() * ifmShape.Width() * ifmShape.Depth() ) + if ( (axes < 4 || ifmShape.Elements() == ifmShape.Height() * ifmShape.Width() * ifmShape.Depth()) && + arch->Constraints()->SupportsTranspose(op->Type(), ofmConn->transpose) != TransposeSupport::None ) { for ( int axis = 0; axis < axes; axis++ ) { diff --git a/ethosu/regor/compiler/scheduler_operation.hpp b/ethosu/regor/compiler/scheduler_operation.hpp index 02777648..1db3d2d7 100644 --- a/ethosu/regor/compiler/scheduler_operation.hpp +++ b/ethosu/regor/compiler/scheduler_operation.hpp @@ -334,10 +334,13 @@ public: { for ( const auto &item : list->pairs() ) { - auto usage = item.first; const auto &connection = item.second; - auto &vec = IsOFM(usage) ? connection.tensor->producers : connection.tensor->consumers; - vec.erase(std::remove(vec.begin(), vec.end(), this), vec.end()); + if ( connection.tensor ) + { + auto usage = item.first; + auto &vec = IsOFM(usage) ? connection.tensor->producers : connection.tensor->consumers; + vec.erase(std::remove(vec.begin(), vec.end(), this), vec.end()); + } } } inputs.clear(); -- GitLab