From cbcae5b8aba1b60d94af0717923e78774d08944c Mon Sep 17 00:00:00 2001
From: Philip Hall <philip.hall@arm.com>
Date: Fri, 31 Jan 2025 10:56:20 +0000
Subject: [PATCH] MLBEDSW-10106: Improve HLCS featuremap mapping mechanism.

HLCS generation uses a positional equivalence between
tensor index and tensor usage to map IFMs from Schedule
IR to HLCS. This doesn't scale and requires looping over
a fixed number of IFMs.

 - Switch from positional to tensor-usage based IFM
   mappings.
 - Allow potentially unlimited input IFMs for HLCS
   operators.
 - Improve field layout of HLCFeatureMap to reduce
   memory wasted by padding.

Signed-off-by: Philip Hall <philip.hall@arm.com>
Change-Id: Iea517eb8994e70ba3d8c0618dfd487706e785b60
---
 .../compiler/high_level_command_stream.hpp    | 19 +++++-----
 .../high_level_command_stream_generator.cpp   | 38 +++++++++----------
 ethosu/regor/compiler/tensor_properties.hpp   |  2 -
 3 files changed, 29 insertions(+), 30 deletions(-)
diff --git a/ethosu/regor/compiler/high_level_command_stream.hpp b/ethosu/regor/compiler/high_level_command_stream.hpp
index 9f248def..648509fa 100644
--- a/ethosu/regor/compiler/high_level_command_stream.hpp
+++ b/ethosu/regor/compiler/high_level_command_stream.hpp
@@ -65,21 +65,22 @@ struct HLCPadding
 /// </summary>
 struct HLCFeatureMap
 {
+    TensorUsage usage = TensorUsage::None;
+    DataType dataType = DataType::None;
     TensorFormat format = TensorFormat::Unknown;
-    MemArea memArea;
-    Shape shape;
+    TransposeType transpose = TransposeType::None;
+    ReverseType reverse = ReverseType::None;
     TensorSlice slice;
+    Shape shape;
     Shape strides;
-    Point2i stepXY = {1, 1};
-    DataType dataType;
-    Address address = -1;
+    MemArea memArea;
     BufferView bufferView;
     Quantization quantization;
-    ArchResampling resamplingMode = ArchResampling::None;
-    TransposeType transpose = TransposeType::None;
-    ReverseType reverse = ReverseType::None;
-    HLCRoundMode rounding;
+    Point2i stepXY = {1, 1};
+    Address address = -1;
     UniqueId uid = ~0u;
+    HLCRoundMode rounding = HLCRoundMode::AUTO;
+    ArchResampling resamplingMode = ArchResampling::None;
 
     int AllocationSizeBytes() const { return TensorAllocationBytes(shape, format, dataType); }
 
diff --git a/ethosu/regor/compiler/high_level_command_stream_generator.cpp b/ethosu/regor/compiler/high_level_command_stream_generator.cpp
index b760a420..9b9b5863 100644
--- a/ethosu/regor/compiler/high_level_command_stream_generator.cpp
+++ b/ethosu/regor/compiler/high_level_command_stream_generator.cpp
@@ -233,7 +233,7 @@ static Shape GetStrides(const HLCFeatureMap &fm)
     }
 }
 
-static void MakeFeatureMap(const SchedulerConnection *schedConn, HLCFeatureMap &fm)
+static void MakeFeatureMap(TensorUsage usage, const SchedulerConnection *schedConn, HLCFeatureMap &fm)
 {
     auto schedTens = schedConn->tensor.get();
     fm.shape = schedConn->shape;
@@ -241,6 +241,7 @@ static void MakeFeatureMap(const SchedulerConnection *schedConn, HLCFeatureMap &
     fm.dataType = schedTens->dataType;
     fm.memArea = schedTens->memArea;
     fm.format = schedTens->format;
+    fm.usage = usage;
     fm.address = schedTens->AllocatedAddress();
     fm.quantization = schedConn->quantization;
     fm.bufferView = schedTens->bufferView;
@@ -281,16 +282,16 @@ static HLCSubOperation MakeSubOperation(const std::unique_ptr<SchedulerOperation
     hlcSubOp.type = schedOp->Type();
     auto lutConn = schedOp->TryInput(TensorUsage::LUT);
 
-    for ( int i = 0; i < MAX_NUM_IFM; ++i )
+    for ( const auto &input : schedOp->inputs.pairs() )
     {
-        auto ifm = schedOp->TryIFM(i);
-        if ( ifm != nullptr )
+        if ( IsIFM(input.first) )
         {
+            assert(((input.first == TensorUsage::IFM0) && hlcSubOp.ifm.empty()) || !hlcSubOp.ifm.empty());
             hlcSubOp.ifm.emplace_back();
-            MakeFeatureMap(ifm, hlcSubOp.ifm.back());
+            MakeFeatureMap(input.first, &input.second, hlcSubOp.ifm.back());
         }
     }
-    MakeFeatureMap(schedOp->OFM(), hlcSubOp.ofm);
+    MakeFeatureMap(TensorUsage::OFM, schedOp->OFM(), hlcSubOp.ofm);
     hlcSubOp._srcId = schedOp->Uid();
 
     if ( schedOp->Type() == OpType::LeakyRelu )
@@ -319,17 +320,16 @@ static std::shared_ptr<HLCOperation> MakeOperation(SchedulerOperation *schedOp,
     op->config = opInfo->Config();
     op->_srcId = schedOp->Uid();
 
-    for ( int i = 0; i < MAX_NUM_IFM; ++i )
+    for ( const auto &input : schedOp->inputs.pairs() )
     {
-        auto ifm = schedOp->TryIFM(i);
-        if ( ifm != nullptr )
+        if ( IsIFM(input.first) )
         {
-            HLCFeatureMap fm;
-            MakeFeatureMap(ifm, fm);
-            op->ifm.push_back(fm);
+            assert(((input.first == TensorUsage::IFM0) && op->ifm.empty()) || !op->ifm.empty());  // map not in order
+            op->ifm.emplace_back();
+            MakeFeatureMap(input.first, &input.second, op->ifm.back());
         }
     }
-    MakeFeatureMap(schedOp->OFM(), op->ofm);
+    MakeFeatureMap(TensorUsage::OFM, schedOp->OFM(), op->ofm);
 #ifndef NDEBUG
     op->name = schedOp->OFM()->tensor->Name();
 #endif
@@ -633,7 +633,6 @@ void HLCStreamGenerator::GenerateHLCStripeCommands(SchedulerOperation *op, const
     }
     assert(ofmStart.Size() >= 4);
     assert(ofmEnd.Size() >= 4);
-    assert(hlcOp->ifm.size() <= 2);
 
     // Binary elementwise using broadcast to repeat smaller IFMs over larger IFM volumes need their
     // coordinates to wrap at the limits of the smaller IFM volume.
@@ -660,9 +659,10 @@ void HLCStreamGenerator::GenerateHLCStripeCommands(SchedulerOperation *op, const
                 hlcStripe->padding = padding;
                 hlcStripe->ofmArea = outputArea;
                 hlcStripe->opGroup = opGroup;
-                for ( unsigned ifmIndex = 0; ifmIndex < hlcOp->ifm.size(); ++ifmIndex )
+                for ( const auto &fm : hlcOp->ifm )
                 {
-                    auto ifmConn = op->IFM(ifmIndex);
+                    if ( !IsIFM(fm.usage) ) continue;
+                    auto ifmConn = op->Input(fm.usage);
                     // Calculate input area based on the output area
                     auto inputArea = TransformWithStridesAndSkirt(outputArea, &strides, ifmConn->stepXY, &skirt, ifmConn->shape,
                         opType, ofmConn->slice.offset, ifmConn->slice.offset, ifmConn->slice.shape, dilatedKernelHeight,
@@ -880,14 +880,14 @@ void HLCStreamGenerator::PrintCommandStream(const NPUOperation *npuOp, std::vect
         auto op = schedOp.get();
         const auto hlcOp = hlcOps[opIndex].get();
         LOG_PRINT("{} {}\n", opIndex, hlcOp->ToString());
-        LOG_PRINT("  IFM: {}, {}\n", op->IFM(0)->tensor->Name(), hlcOp->ifm[0].ToString());
+        LOG_PRINT("  IFM: {}, {}\n", op->Input(hlcOp->ifm[0].usage)->tensor->Name(), hlcOp->ifm[0].ToString());
         if ( hlcOp->ifm.size() > 1 )
         {
-            LOG_PRINT("  IFM2: {}, {}\n", op->IFM(1)->tensor->Name(), hlcOp->ifm[1].ToString());
+            LOG_PRINT("  IFM2: {}, {}\n", op->Input(hlcOp->ifm[1].usage)->tensor->Name(), hlcOp->ifm[1].ToString());
         }
         if ( hlcOp->ifm.size() > 2 )
         {
-            LOG_PRINT("  IFM3: {}, {}\n", op->IFM(2)->tensor->Name(), hlcOp->ifm[2].ToString());
+            LOG_PRINT("  IFM3: {}, {}\n", op->Input(hlcOp->ifm[2].usage)->tensor->Name(), hlcOp->ifm[2].ToString());
         }
         LOG_PRINT("  OFM: {}, {}\n", op->OFM()->tensor->Name(), hlcOp->ofm.ToString());
         if ( hlcOp->weights != nullptr )
diff --git a/ethosu/regor/compiler/tensor_properties.hpp b/ethosu/regor/compiler/tensor_properties.hpp
index 2ab44e5c..ac206e92 100644
--- a/ethosu/regor/compiler/tensor_properties.hpp
+++ b/ethosu/regor/compiler/tensor_properties.hpp
@@ -63,8 +63,6 @@ enum class TensorUsage : uint32_t
 
 DECLARE_ENUM_AS_FLAGS(TensorUsage)
 
-constexpr int MAX_NUM_IFM = 3;
-
 constexpr inline bool IsOFM(TensorUsage usage)
 {
     return (usage & TensorUsage::TypeMask) == TensorUsage::OFM;
-- 
GitLab