From 2e0d490c533dde5b08df3a7098db798b8161f48a Mon Sep 17 00:00:00 2001
From: Johan Gunnarsson <johan.gunnarsson@arm.com>
Date: Fri, 17 Jan 2025 15:44:18 +0100
Subject: [PATCH] MLBEDSW-10158: Don't allocate fast memory for cross-CPU
 tensors

A cross-CPU tensor is a tensor with a live range that covers two
command streams, potentially with a CPU op in between them. In a
multi-threaded setup, the fast storage is not guaranteed to be
left unchanged between the execution of the command streams.

This patch can have a negative performance impact for networks
with multiple command streams, since cross-CPU tensors are no
longer placed in fast storage.

Signed-off-by: Johan Gunnarsson <johan.gunnarsson@arm.com>
Change-Id: I1cc8f51bf5c1b01e212dedc86c9cd2a415648ec0
---
 .../regor/compiler/faststorage_allocator.cpp  |  52 +++++--
 ethosu/regor/test/CMakeLists.txt              |   3 +-
 .../test/test_fast_storage_allocator.cpp      | 146 ++++++++++++++++++
 ethosu/regor/test/util.cpp                    |   5 +-
 ethosu/regor/test/util.hpp                    |   3 +-
 5 files changed, 197 insertions(+), 12 deletions(-)
 create mode 100644 ethosu/regor/test/test_fast_storage_allocator.cpp
diff --git a/ethosu/regor/compiler/faststorage_allocator.cpp b/ethosu/regor/compiler/faststorage_allocator.cpp
index 43e6c67e..c304e3fe 100644
--- a/ethosu/regor/compiler/faststorage_allocator.cpp
+++ b/ethosu/regor/compiler/faststorage_allocator.cpp
@@ -1,5 +1,5 @@
 //
-// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+// SPDX-FileCopyrightText: Copyright 2021-2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
 //
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -18,6 +18,8 @@
 
 #include "compiler/faststorage_allocator.hpp"
 
+#include "common/logging.hpp"
+
 #include "architecture/architecture.hpp"
 #include "common/vector_span.hpp"
 #include "live_range.hpp"
@@ -139,6 +141,7 @@ void FastStorageAllocator::AllocateFeatureMaps(const std::vector<std::unique_ptr
             if ( !ofm->tensor->consumers.empty() && !ofm->tensor->hasCPUReaders && !ofm->tensor->isGraphOutput &&
                  _scratchedFms.count(ofm->tensor.get()) == 0 && opGroup->NeedsAllocation(ofm->tensor->uid) )
             {
+                LOG_TRACE1("Candidate fast storage tensor: {}\n", ofm->tensor->Name());
                 _scratchedFms[ofm->tensor.get()] = ofm->tensor->memArea;
                 ofm->tensor->memArea = fastStorage;
             }
@@ -148,6 +151,7 @@ void FastStorageAllocator::AllocateFeatureMaps(const std::vector<std::unique_ptr
                 if ( !ofm->tensor->consumers.empty() && !ofm->tensor->hasCPUReaders && !ofm->tensor->isGraphOutput &&
                      _scratchedFms.count(ofm->tensor.get()) == 0 && opGroup->NeedsAllocation(ofm->tensor->uid) )
                 {
+                    LOG_TRACE1("Candidate fast storage tensor: {}\n", ofm->tensor->Name());
                     _scratchedFms[ofm->tensor.get()] = ofm->tensor->memArea;
                     ofm->tensor->memArea = fastStorage;
                 }
@@ -161,13 +165,7 @@ void FastStorageAllocator::AllocateFeatureMaps(const std::vector<std::unique_ptr
     int maxUsage;
     _maxMemUsage = lrGraph.GetTemporalMemoryUsage(maxUsage);
 
-    if ( maxUsage <= _stagingLimit )
-    {
-        // All feature maps fit in fast storage
-        ElementwiseSanitizer(schedOps, schedule, fastStorage, lrGraph);
-        return;
-    }
-    // Not all feature maps fit in fast storage
+    // Collect all live ranges that can potentially be in fast storage
     _baseMemUsage = _maxMemUsage;
     std::vector<LiveRange *> lrs;
     for ( auto lr : lrGraph.LiveRanges() )
@@ -186,9 +184,45 @@ void FastStorageAllocator::AllocateFeatureMaps(const std::vector<std::unique_ptr
         }
     }
 
+    // Collect time indices of all CPU operators
+    std::vector<int> cpuTimeIndices;
+    for ( auto &schedOp : schedOps )
+    {
+        if ( !schedOp->IsNpuOp() )
+        {
+            auto cost = schedule->Cost(schedOp.get());
+            cpuTimeIndices.push_back(cost->timeIndex);
+        }
+    }
+    assert(std::is_sorted(cpuTimeIndices.begin(), cpuTimeIndices.end()));
+
+    // Evict live ranges that cross a CPU operator
+    std::vector<LiveRange *> npuOnlyLrs;
+    for ( auto lr : lrs )
+    {
+        auto cpuTimeIndex = std::lower_bound(cpuTimeIndices.begin(), cpuTimeIndices.end(), lr->startTime);
+        if ( cpuTimeIndex != cpuTimeIndices.end() && *cpuTimeIndex <= lr->endTime )
+        {
+            // Live range crosses CPU operator
+            LOG_TRACE1("Evicting cross-CPU live range {}-{}\n", lr->startTime, lr->endTime);
+            Evict(lr);
+        }
+        else
+        {
+            npuOnlyLrs.push_back(lr);
+        }
+    }
+
+    if ( maxUsage <= _stagingLimit )
+    {
+        // All feature maps fit in fast storage
+        ElementwiseSanitizer(schedOps, schedule, fastStorage, lrGraph);
+        return;
+    }
+
     // Perform a first sweep to keep/evict live ranges that are obviously too big
     std::vector<LiveRange *> canFitLrs;
-    for ( auto lr : lrs )
+    for ( auto lr : npuOnlyLrs )
     {
         // Highest memory usage in this live range
         int baseUsage = *std::max_element(&_baseMemUsage[lr->startTime], &_baseMemUsage[lr->endTime + 1]);
diff --git a/ethosu/regor/test/CMakeLists.txt b/ethosu/regor/test/CMakeLists.txt
index 576b1085..2c679531 100644
--- a/ethosu/regor/test/CMakeLists.txt
+++ b/ethosu/regor/test/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-# SPDX-FileCopyrightText: Copyright 2021, 2023-2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+# SPDX-FileCopyrightText: Copyright 2021, 2023-2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
 #
 # SPDX-License-Identifier: Apache-2.0
 #
@@ -59,6 +59,7 @@ add_catch_test(
         test_scheduler_packing.cpp
         test_operation_utils.cpp
         test_graphir_optimiser.cpp
+        test_fast_storage_allocator.cpp
     DEPS
         test_common
 )
diff --git a/ethosu/regor/test/test_fast_storage_allocator.cpp b/ethosu/regor/test/test_fast_storage_allocator.cpp
new file mode 100644
index 00000000..fe1a2d1f
--- /dev/null
+++ b/ethosu/regor/test/test_fast_storage_allocator.cpp
@@ -0,0 +1,146 @@
+//
+// SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the License); you may
+// not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an AS IS BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "common/common.hpp"
+
+#include "architecture/ethosu85/ethos_u85.hpp"
+#include "compiler/faststorage_allocator.hpp"
+#include "compiler/scheduler.hpp"
+#include "util.hpp"
+
+#include <catch_all.hpp>
+#include <memory>
+
+#include "regor.h"
+
+static std::shared_ptr<SchedulerTensor> CreateTensor(std::string name, MemArea memArea)
+{
+    auto schedTensor = CreateSchedulerTensor(name, Shape(10, 10, 10), DataType::Int8);  // Fixed 10x10x10 Int8 tensor
+    schedTensor->memArea = memArea;  // Start in the caller-chosen memory area
+    return schedTensor;
+}
+
+static std::unique_ptr<SchedulerOperation> CreateSchedulerOperation(std::unique_ptr<Architecture> &arch, bool npu,
+    TensorUsage ifmUsage, std::shared_ptr<SchedulerTensor> &ifm, TensorUsage ofmUsage, std::shared_ptr<SchedulerTensor> &ofm)
+{
+    auto schedOp = CreateSchedulerOperation(OpType::AvgPool, ifmUsage, ifm, ofmUsage, ofm);  // Always an AvgPool
+    schedOp->SetNpuOp(npu);
+    if ( npu )
+    {
+        ArchitectureOpGroupQuery query{};  // NPU op: build a single-IFM op group query from the two tensors
+        query.type = schedOp->Type();
+        query.kernel = schedOp->Kernel();
+        query.inputs = 1;
+        query.ifm[0].key = ifm->uid;
+        query.ifm[0].type = ifm->dataType;
+        query.ifm[0].shape = ifm->storageShape;
+        query.ofm.key = ofm->uid;
+        query.ofm.type = ofm->dataType;
+        query.ofm.shape = ofm->storageShape;
+        query.ofm.transpose = TransposeType::None;
+        query.ofm.reverse = ReverseType::None;
+        auto opGroup = arch->CreateOpGroup(query);
+        REQUIRE(opGroup != nullptr);  // Checked in every build type, unlike assert()
+        schedOp->SetOpGroup(std::move(opGroup));
+    }
+    else
+    {
+        ifm->hasCPUReaders = true;  // CPU op: flag its input as CPU-read and its output as CPU-written
+        ofm->hasCPUWriters = true;
+    }
+    return schedOp;
+}
+
+static std::unique_ptr<Schedule> CreateSchedule(std::unique_ptr<Architecture> &arch, std::vector<std::unique_ptr<SchedulerOperation>> &schedOps)
+{
+    auto schedule = std::make_unique<Schedule>("test");
+    for ( auto &op : schedOps )  // One cost entry per operation, NPU and CPU alike
+    {
+        auto ifm = op->IFM(0);
+        auto ofm = op->OFM();
+        ArchitectureConfigQuery query{};  // Describe the op so the architecture can return a config for it
+        query.kernel = op->Kernel();
+        query.ifmBits = DataTypeSizeBits(ifm->tensor->dataType);
+        query.ifmShape[0] = ifm->shape;
+        query.ofmShape = ofm->shape;
+        query.transpose = TransposeType::None;
+        query.reverse = ReverseType::None;
+        auto opConfig = arch->GetOpConfig(op->Type(), query);
+        REQUIRE(opConfig != nullptr);  // Checked in every build type, unlike assert()
+        auto schedOpInfo = std::make_unique<SchedulerOpInfo>(std::move(opConfig), ifm->shape, Shape(), ofm->shape);
+        schedule->SetCost(*op, std::move(schedOpInfo));
+    }
+    return schedule;
+}
+
+TEST_CASE("test_fast_storage_allocator")
+{
+    // Create arch
+    auto arch = CreateArchDefault<ArchEthosU85>();
+    std::string err = "noerror";  // Sentinel value; must be unchanged after the configuration check
+    arch->CheckConfiguration(err);
+    REQUIRE(err == "noerror");
+
+    // Create some memories
+    const MemArea fast = arch->StagingMemory();
+    const MemArea notFast = arch->FeatureMapMemory();
+
+    // Create some tensors
+    auto tens1 = CreateTensor("t1", notFast);
+    auto tens2 = CreateTensor("t2", notFast);
+    auto tens3 = CreateTensor("t3", notFast);
+    auto tens4 = CreateTensor("t4", notFast);
+    auto tens5 = CreateTensor("t5", notFast);
+    auto tens6 = CreateTensor("t6", notFast);
+
+    SECTION("Sequential network")
+    {
+        std::vector<std::unique_ptr<SchedulerOperation>> ops;
+        ops.push_back(CreateSchedulerOperation(arch, true, TensorUsage::IFM, tens1, TensorUsage::OFM, tens2));  // NPU: t1 -> t2
+        ops.push_back(CreateSchedulerOperation(arch, true, TensorUsage::IFM, tens2, TensorUsage::OFM, tens3));  // NPU: t2 -> t3
+
+        auto schedule = CreateSchedule(arch, ops);
+        FastStorageAllocator allocator;
+        allocator.AllocateFeatureMaps(ops, schedule.get(), fast, 32 * 1024);
+
+        REQUIRE(tens1->memArea != fast);  // Because no producers
+        REQUIRE(tens2->memArea == fast);
+        REQUIRE(tens3->memArea != fast);  // Because no consumers
+    }
+
+    SECTION("Mixed NPU/CPU network with a live range covering CPU operation")
+    {
+        std::vector<std::unique_ptr<SchedulerOperation>> ops;
+        ops.push_back(CreateSchedulerOperation(arch, true, TensorUsage::IFM, tens1, TensorUsage::OFM, tens2));  // NPU: t1 -> t2
+        ops.push_back(CreateSchedulerOperation(arch, true, TensorUsage::IFM, tens2, TensorUsage::OFM, tens3));  // NPU: t2 -> t3
+        ops.push_back(CreateSchedulerOperation(arch, false, TensorUsage::IFM, tens2, TensorUsage::OFM, tens4));  // CPU: t2 -> t4
+        ops.push_back(CreateSchedulerOperation(arch, true, TensorUsage::IFM, tens3, TensorUsage::OFM, tens5));  // NPU: t3 -> t5
+        ops.push_back(CreateSchedulerOperation(arch, true, TensorUsage::IFM, tens4, TensorUsage::OFM, tens6));  // NPU: t4 -> t6
+
+        auto schedule = CreateSchedule(arch, ops);
+        FastStorageAllocator allocator;
+        allocator.AllocateFeatureMaps(ops, schedule.get(), fast, 32 * 1024);
+
+        REQUIRE(tens1->memArea != fast);  // Because no producers
+        REQUIRE(tens2->memArea != fast);  // Because CPU readers
+        REQUIRE(tens3->memArea != fast);  // Because live range covers CPU operation
+        REQUIRE(tens4->memArea != fast);  // Because CPU writers
+        REQUIRE(tens5->memArea != fast);  // Because no consumers
+        REQUIRE(tens6->memArea != fast);  // Because no consumers
+    }
+}
diff --git a/ethosu/regor/test/util.cpp b/ethosu/regor/test/util.cpp
index 617f495c..daedb4f8 100644
--- a/ethosu/regor/test/util.cpp
+++ b/ethosu/regor/test/util.cpp
@@ -21,6 +21,7 @@
 #include "common/data_type.hpp"
 #include "common/ini_reader.hpp"
 
+#include <memory>
 #include <mutex>
 #include <thread>
 
@@ -59,7 +60,7 @@ std::string TestConfig(int macs)
     // System configuration
     config += "[system]\n";
     config += "const=flash\n";
-    config += "feature_maps=sram\n";
+    config += "feature_maps=dram\n";
     config += "staging=sram\n";
     return config;
 }
@@ -157,6 +158,7 @@ std::shared_ptr<Operation> CreateOperation(OpType opType, TensorUsage ifmUsage,
     TensorUsage ofmUsage, std::shared_ptr<Tensor> &ofm)
 {
     auto op = std::make_shared<Operation>(opType);
+    op->SetKernel(std::make_unique<Kernel>(Kernel::UnitKernel()));
     op->ConnectInput(ifmUsage, ifm);
     op->ConnectOutput(ofmUsage, ofm);
     return op;
@@ -219,6 +221,7 @@ std::unique_ptr<SchedulerOperation> CreateSchedulerOperation(OpType opType, Tens
     s_ops.add_op(op);
 
     auto schedOp = std::make_unique<SchedulerOperation>(opType);
+    schedOp->SetKernel(op->Kernel());
     schedOp->_srcKey = static_cast<void *>(op.get());
     // ifm
     auto *ifmConn = schedOp->AddInput(ifmUsage);
diff --git a/ethosu/regor/test/util.hpp b/ethosu/regor/test/util.hpp
index 240d4dd9..854740bb 100644
--- a/ethosu/regor/test/util.hpp
+++ b/ethosu/regor/test/util.hpp
@@ -1,5 +1,5 @@
 //
-// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
 //
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -20,6 +20,7 @@
 
 #include "architecture/architecture.hpp"
 #include "compiler/graph.hpp"
+#include "compiler/scheduler.hpp"
 #include "compiler/scheduler_operation.hpp"
 
 using namespace regor;
-- 
GitLab