From 2e0d490c533dde5b08df3a7098db798b8161f48a Mon Sep 17 00:00:00 2001 From: Johan Gunnarsson Date: Fri, 17 Jan 2025 15:44:18 +0100 Subject: [PATCH] MLBEDSW-10158: Don't allocate fast memory for cross-CPU tensors A cross-CPU tensor is a tensor with a live range that covers two command streams, potentially with a CPU op in between them. In a multi-threaded setup, the fast storage is not guaranteed to be left unchanged between the execution of the command streams. This patch can have a negative performance impact for networks with multiple command streams. Signed-off-by: Johan Gunnarsson Change-Id: I1cc8f51bf5c1b01e212dedc86c9cd2a415648ec0 --- .../regor/compiler/faststorage_allocator.cpp | 52 +++++-- ethosu/regor/test/CMakeLists.txt | 3 +- .../test/test_fast_storage_allocator.cpp | 146 ++++++++++++++++++ ethosu/regor/test/util.cpp | 5 +- ethosu/regor/test/util.hpp | 3 +- 5 files changed, 197 insertions(+), 12 deletions(-) create mode 100644 ethosu/regor/test/test_fast_storage_allocator.cpp diff --git a/ethosu/regor/compiler/faststorage_allocator.cpp b/ethosu/regor/compiler/faststorage_allocator.cpp index 43e6c67e..c304e3fe 100644 --- a/ethosu/regor/compiler/faststorage_allocator.cpp +++ b/ethosu/regor/compiler/faststorage_allocator.cpp @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2021-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -18,6 +18,8 @@ #include "compiler/faststorage_allocator.hpp" +#include "common/logging.hpp" + #include "architecture/architecture.hpp" #include "common/vector_span.hpp" #include "live_range.hpp" @@ -139,6 +141,7 @@ void FastStorageAllocator::AllocateFeatureMaps(const std::vectortensor->consumers.empty() && !ofm->tensor->hasCPUReaders && !ofm->tensor->isGraphOutput && _scratchedFms.count(ofm->tensor.get()) == 0 && opGroup->NeedsAllocation(ofm->tensor->uid) ) { + LOG_TRACE1("Candidate fast storage tensor: {}\n", ofm->tensor->Name()); _scratchedFms[ofm->tensor.get()] = ofm->tensor->memArea; ofm->tensor->memArea = fastStorage; } @@ -148,6 +151,7 @@ void FastStorageAllocator::AllocateFeatureMaps(const std::vectortensor->consumers.empty() && !ofm->tensor->hasCPUReaders && !ofm->tensor->isGraphOutput && _scratchedFms.count(ofm->tensor.get()) == 0 && opGroup->NeedsAllocation(ofm->tensor->uid) ) { + LOG_TRACE1("Candidate fast storage tensor: {}\n", ofm->tensor->Name()); _scratchedFms[ofm->tensor.get()] = ofm->tensor->memArea; ofm->tensor->memArea = fastStorage; } @@ -161,13 +165,7 @@ void FastStorageAllocator::AllocateFeatureMaps(const std::vector lrs; for ( auto lr : lrGraph.LiveRanges() ) @@ -186,9 +184,45 @@ void FastStorageAllocator::AllocateFeatureMaps(const std::vector cpuTimeIndices; + for ( auto &schedOp : schedOps ) + { + if ( !schedOp->IsNpuOp() ) + { + auto cost = schedule->Cost(schedOp.get()); + cpuTimeIndices.push_back(cost->timeIndex); + } + } + assert(std::is_sorted(cpuTimeIndices.begin(), cpuTimeIndices.end())); + + // Evict live ranges that cross a CPU operator + std::vector npuOnlyLrs; + for ( auto lr : lrs ) + { + auto cpuTimeIndex = std::lower_bound(cpuTimeIndices.begin(), cpuTimeIndices.end(), lr->startTime); + if ( cpuTimeIndex != cpuTimeIndices.end() && *cpuTimeIndex <= lr->endTime ) + { + // Live range crosses CPU operator + LOG_TRACE1("Evicting cross-CPU live range {}-{}\n", lr->startTime, lr->endTime); + Evict(lr); + } + else + { + npuOnlyLrs.push_back(lr); + } + } + + if ( maxUsage <= _stagingLimit ) + { + // All feature maps fit in fast storage + ElementwiseSanitizer(schedOps, schedule, fastStorage, lrGraph); + return; + } + // Perform a first sweep to keep/evict live ranges that are obviously too big std::vector canFitLrs; - for ( auto lr : lrs ) + for ( auto lr : npuOnlyLrs ) { // Highest memory usage in this live range int baseUsage = *std::max_element(&_baseMemUsage[lr->startTime], &_baseMemUsage[lr->endTime + 1]); diff --git a/ethosu/regor/test/CMakeLists.txt b/ethosu/regor/test/CMakeLists.txt index 576b1085..2c679531 100644 --- a/ethosu/regor/test/CMakeLists.txt +++ b/ethosu/regor/test/CMakeLists.txt @@ -1,5 +1,5 @@ -# SPDX-FileCopyrightText: Copyright 2021, 2023-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2021, 2023-2025 Arm Limited and/or its affiliates # # SPDX-License-Identifier: Apache-2.0 # @@ -59,6 +59,7 @@ add_catch_test( test_scheduler_packing.cpp test_operation_utils.cpp test_graphir_optimiser.cpp + test_fast_storage_allocator.cpp DEPS test_common ) diff --git a/ethosu/regor/test/test_fast_storage_allocator.cpp b/ethosu/regor/test/test_fast_storage_allocator.cpp new file mode 100644 index 00000000..fe1a2d1f --- /dev/null +++ b/ethosu/regor/test/test_fast_storage_allocator.cpp @@ -0,0 +1,146 @@ +// +// SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "common/common.hpp" + +#include "architecture/ethosu85/ethos_u85.hpp" +#include "compiler/faststorage_allocator.hpp" +#include "compiler/scheduler.hpp" +#include "util.hpp" + +#include +#include + +#include "regor.h" + +static std::shared_ptr CreateTensor(std::string name, MemArea memArea) +{ + auto schedTensor = CreateSchedulerTensor(name, Shape(10, 10, 10), DataType::Int8); + schedTensor->memArea = memArea; + return schedTensor; +} + +static std::unique_ptr CreateSchedulerOperation(std::unique_ptr &arch, bool npu, + TensorUsage ifmUsage, std::shared_ptr &ifm, TensorUsage ofmUsage, std::shared_ptr &ofm) +{ + auto schedOp = CreateSchedulerOperation(OpType::AvgPool, ifmUsage, ifm, ofmUsage, ofm); + schedOp->SetNpuOp(npu); + if ( npu ) + { + ArchitectureOpGroupQuery query{}; + query.type = schedOp->Type(); + query.kernel = schedOp->Kernel(); + query.inputs = 1; + query.ifm[0].key = ifm->uid; + query.ifm[0].type = ifm->dataType; + query.ifm[0].shape = ifm->storageShape; + query.ofm.key = ofm->uid; + query.ofm.type = ofm->dataType; + query.ofm.shape = ofm->storageShape; + query.ofm.transpose = TransposeType::None; + query.ofm.reverse = ReverseType::None; + auto opGroup = arch->CreateOpGroup(query); + assert(opGroup); + schedOp->SetOpGroup(std::move(opGroup)); + } + else + { + ifm->hasCPUReaders = true; + ofm->hasCPUWriters = true; + } + return schedOp; +} + +static std::unique_ptr CreateSchedule(std::unique_ptr &arch, std::vector> &schedOps) +{ + auto schedule = std::make_unique("test"); + for ( auto &op : schedOps ) + { + auto ifm = op->IFM(0); + auto ofm = op->OFM(); + ArchitectureConfigQuery query{}; + query.kernel = op->Kernel(); + query.ifmBits = DataTypeSizeBits(ifm->tensor->dataType); + query.ifmShape[0] = ifm->shape; + query.ofmShape = ofm->shape; + query.transpose = TransposeType::None; + query.reverse = ReverseType::None; + auto opConfig = arch->GetOpConfig(op->Type(), query); + assert(opConfig); + auto schedOpInfo = std::make_unique(std::move(opConfig), ifm->shape, Shape(), ofm->shape); + schedule->SetCost(*op, std::move(schedOpInfo)); + } + return schedule; +} + +TEST_CASE("test_fast_storage_allocator") +{ + // Create arch + auto arch = CreateArchDefault(); + std::string err = "noerror"; + arch->CheckConfiguration(err); + REQUIRE(err == "noerror"); + + // Create some memories + const MemArea fast = arch->StagingMemory(); + const MemArea notFast = arch->FeatureMapMemory(); + + // Create some tensors + auto tens1 = CreateTensor("t1", notFast); + auto tens2 = CreateTensor("t2", notFast); + auto tens3 = CreateTensor("t3", notFast); + auto tens4 = CreateTensor("t4", notFast); + auto tens5 = CreateTensor("t5", notFast); + auto tens6 = CreateTensor("t6", notFast); + + SECTION("Sequential network") + { + std::vector> ops; + ops.push_back(CreateSchedulerOperation(arch, true, TensorUsage::IFM, tens1, TensorUsage::OFM, tens2)); + ops.push_back(CreateSchedulerOperation(arch, true, TensorUsage::IFM, tens2, TensorUsage::OFM, tens3)); + + auto schedule = CreateSchedule(arch, ops); + FastStorageAllocator allocator; + allocator.AllocateFeatureMaps(ops, schedule.get(), fast, 32 * 1024); + + REQUIRE(tens1->memArea != fast); // Because no producers + REQUIRE(tens2->memArea == fast); + REQUIRE(tens3->memArea != fast); // Because no consumers + } + + SECTION("Mixed NPU/CPU network with a live range covering CPU operation") + { + std::vector> ops; + ops.push_back(CreateSchedulerOperation(arch, true, TensorUsage::IFM, tens1, TensorUsage::OFM, tens2)); + ops.push_back(CreateSchedulerOperation(arch, true, TensorUsage::IFM, tens2, TensorUsage::OFM, tens3)); + ops.push_back(CreateSchedulerOperation(arch, false, TensorUsage::IFM, tens2, TensorUsage::OFM, tens4)); + ops.push_back(CreateSchedulerOperation(arch, true, TensorUsage::IFM, tens3, TensorUsage::OFM, tens5)); + ops.push_back(CreateSchedulerOperation(arch, true, TensorUsage::IFM, tens4, TensorUsage::OFM, tens6)); + + auto schedule = CreateSchedule(arch, ops); + FastStorageAllocator allocator; + allocator.AllocateFeatureMaps(ops, schedule.get(), fast, 32 * 1024); + + REQUIRE(tens1->memArea != fast); // Because no producers + REQUIRE(tens2->memArea != fast); // Because CPU readers + REQUIRE(tens3->memArea != fast); // Because live range covers CPU operation + REQUIRE(tens4->memArea != fast); // Because CPU writers + REQUIRE(tens5->memArea != fast); // Because no consumers + REQUIRE(tens6->memArea != fast); // Because no consumers + } +} diff --git a/ethosu/regor/test/util.cpp b/ethosu/regor/test/util.cpp index 617f495c..daedb4f8 100644 --- a/ethosu/regor/test/util.cpp +++ b/ethosu/regor/test/util.cpp @@ -21,6 +21,7 @@ #include "common/data_type.hpp" #include "common/ini_reader.hpp" +#include #include #include @@ -59,7 +60,7 @@ std::string TestConfig(int macs) // System configuration config += "[system]\n"; config += "const=flash\n"; - config += "feature_maps=sram\n"; + config += "feature_maps=dram\n"; config += "staging=sram\n"; return config; } @@ -157,6 +158,7 @@ std::shared_ptr CreateOperation(OpType opType, TensorUsage ifmUsage, TensorUsage ofmUsage, std::shared_ptr &ofm) { auto op = std::make_shared(opType); + op->SetKernel(std::make_unique(Kernel::UnitKernel())); op->ConnectInput(ifmUsage, ifm); op->ConnectOutput(ofmUsage, ofm); return op; @@ -219,6 +221,7 @@ std::unique_ptr CreateSchedulerOperation(OpType opType, Tens s_ops.add_op(op); auto schedOp = std::make_unique(opType); + schedOp->SetKernel(op->Kernel()); schedOp->_srcKey = static_cast(op.get()); // ifm auto *ifmConn = schedOp->AddInput(ifmUsage); diff --git a/ethosu/regor/test/util.hpp b/ethosu/regor/test/util.hpp index 240d4dd9..854740bb 100644 --- a/ethosu/regor/test/util.hpp +++ b/ethosu/regor/test/util.hpp @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -20,6 +20,7 @@ #include "architecture/architecture.hpp" #include "compiler/graph.hpp" +#include "compiler/scheduler.hpp" #include "compiler/scheduler_operation.hpp" using namespace regor; -- GitLab