diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.cpp b/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.cpp
index cfbcd53abf043040f504472a208195a054eb95ec..26eec5c1f2c129b42abe5ac514648df1f95b46e8 100644
--- a/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.cpp
+++ b/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.cpp
@@ -497,11 +497,14 @@ int EthosU55RCSGenerator::CalcCommandWaits(const MemoryAccesses &opAccesses, std
 
 // Returns LUT slot to be used for the given LUT operation.
 // Sets alreadyInLutMem to true if the LUT is already in SHRAM.
-int EthosU55RCSGenerator::AllocateLutSlot(
-    std::vector<LutSlot> &lutSlots, const HLCOperation *op, int sizeInSlots, int timestamp, bool &alreadyInLutMem)
+int EthosU55RCSGenerator::AllocateLutSlot(const MemArea &memArea, Address address, int lutSize, int timestamp, bool &alreadyInLutMem)
 {
     alreadyInLutMem = false;
-    int totalSlots = int(lutSlots.size());
+    int lutSlotSize = _arch->_shram.lutSlotSize;
+    assert(lutSize % lutSlotSize == 0);
+
+    int sizeInSlots = lutSize / lutSlotSize;
+    int totalSlots = int(_lutSlots.size());
     if ( sizeInSlots < 0 || sizeInSlots > totalSlots )
     {
         assert(false);
@@ -511,22 +514,25 @@ int EthosU55RCSGenerator::AllocateLutSlot(
     int allocatedSlot = 0;
     for ( int i = 0; i < totalSlots; i += sizeInSlots )
     {
-        if ( lutSlots[i].hlcOp == op )
+        if ( _lutSlots[i].memory == memArea.memory && _lutSlots[i].address == address && _lutSlots[i].sizeBytes == lutSize )
         {
             // LUT is already in SHRAM
             allocatedSlot = i;
             alreadyInLutMem = true;
             break;
         }
-        if ( lutSlots[i].lastUsed < lutSlots[allocatedSlot].lastUsed )
+        assert(allocatedSlot < totalSlots);
+        if ( _lutSlots[i].lastUsed < _lutSlots[allocatedSlot].lastUsed )
         {
             allocatedSlot = i;
         }
     }
     for ( int j = allocatedSlot; j < allocatedSlot + sizeInSlots; ++j )
     {
-        lutSlots[j].hlcOp = op;
-        lutSlots[j].lastUsed = timestamp;
+        _lutSlots[j].memory = memArea.memory;
+        _lutSlots[j].address = address;
+        _lutSlots[j].sizeBytes = lutSize;
+        _lutSlots[j].lastUsed = timestamp;
     }
     return allocatedSlot;
 }
@@ -1225,20 +1231,18 @@ void EthosU55RCSGenerator::InsertLUTDMACommand(
     assert(op->type == OpType::LUT || (!op->subOps.empty() && op->subOps[0].type == OpType::LUT));
-    const auto &srcTens = op->type == OpType::LUT ? op->parameters.lut : op->subOps[0].parameters.lut;
+    const auto &lutTens = op->type == OpType::LUT ? op->parameters.lut : op->subOps[0].parameters.lut;
     assert(config->_layout.lutStart > 0);
-    assert(srcTens.sizeBytes % lutSlotSize == 0);
     bool alreadyInLutMem;
-    int sizeInSlots = srcTens.sizeBytes / lutSlotSize;
-    int slot = AllocateLutSlot(_lutSlots, op.get(), sizeInSlots, index, alreadyInLutMem);
+    int slot = AllocateLutSlot(lutTens.memArea, lutTens.address, lutTens.sizeBytes, index, alreadyInLutMem);
     _stripeToLutSlot[stripe] = slot;
 
     if ( !alreadyInLutMem )
     {
         auto dma = std::make_unique<HLCDMA>();
-        dma->srcMemArea = srcTens.memArea;
-        dma->srcAddress = srcTens.address;
-        dma->length = srcTens.sizeBytes;
+        dma->srcMemArea = lutTens.memArea;
+        dma->srcAddress = lutTens.address;
+        dma->length = lutTens.sizeBytes;
         dma->destMemArea = _arch->LUTMemory();
         dma->destAddress = _arch->_shram.bankSizeBytes * config->_layout.lutStart + slot * lutSlotSize;
         emitted.push_back(dma.get());
@@ -1940,8 +1944,7 @@ void EthosU55RCSGenerator::PrepareCommand(int index, HighLevelCommand *cmd, Temp
         // LUT is overwritten by SHRAM accumulator buffers; clear slots
         for ( auto &slot : _lutSlots )
         {
-            slot.hlcOp = nullptr;
-            slot.lastUsed = 0;
+            slot = {};
         }
     }
 }
@@ -1959,8 +1962,7 @@ std::vector EthosU55RCSGenerator::GenerateCommandStream(
     // Clear lut slots at start of command stream generation
     for ( auto &slot : _lutSlots )
     {
-        slot.hlcOp = nullptr;
-        slot.lastUsed = 0;
+        slot = {};
     }
 
     GenerateInitialRegisterSetup();
@@ -2011,7 +2013,7 @@ std::vector EthosU55RCSGenerator::GenerateCommandStream(
         // Return command mapping information to the caller
         if ( cmdRanges && cmd->IsStripe() )
         {
-            cmdRanges->emplace_back(static_cast<HLCStripe *>(cmd.get())->operation->_srcId, emitStart, _emit.Position());
+            cmdRanges->emplace_back(static_cast<HLCStripe *>(cmd.get())->operation->srcId, emitStart, _emit.Position());
         }
         cmdIndex++;
     }
diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.hpp b/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.hpp
index 233b90ba316c89e9c84ab89f0cc5afbfb1c350ee..662ce49a148eed65e8ce8c17d45e9651f8b2028c 100644
--- a/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.hpp
+++ b/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.hpp
@@ -111,7 +111,9 @@ using MemoryAccesses = std::vector;
 
 struct LutSlot
 {
-    const HLCOperation *hlcOp = nullptr;
+    const ArchitectureMemory *memory = nullptr;
+    Address address = -1;
+    int sizeBytes = -1;
     int lastUsed = 0;
 };
 
@@ -164,7 +166,7 @@ static int CalcCommandWaits(const MemoryAccesses &opAccesses, std::deque &outstanding);
 
     // Returns LUT slot to be used for the given LUT operation.
    // Sets alreadyInLutMem to true if the LUT is already in SHRAM.
-    int AllocateLutSlot(std::vector<LutSlot> &lutSlots, const HLCOperation *op, int sizeInSlots, int timestamp, bool &alreadyInLutMem);
+    int AllocateLutSlot(const MemArea &memArea, Address address, int lutSize, int timestamp, bool &alreadyInLutMem);
     //----------------------------------------------------------------------
     // Scaling (OFM/OPA/OPB_SCALE)
     //----------------------------------------------------------------------
diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85.cpp b/ethosu/regor/architecture/ethosu85/ethos_u85.cpp
index bf13380a03b1292fd53c63d70053a7b51e69de45..c070b5297c65e483a4eb2d16e06b097765ed85f9 100644
--- a/ethosu/regor/architecture/ethosu85/ethos_u85.cpp
+++ b/ethosu/regor/architecture/ethosu85/ethos_u85.cpp
@@ -1421,19 +1421,6 @@ bool EthosU85OpGroup::Fuse(const ArchitectureOpGroupQuery &op, const std::vector
         return false;
     }
 
-    if ( _chainLength > 1 )
-    {
-        // TODO MLBEDSW-9142: support fusing on chained ops
-        return false;
-    }
-
-    // activation fusing..
-    if ( op.ifm[0].type == DataType::Int16 && (op.type == OpType::Sigmoid || op.type == OpType::Tanh) )
-    {
-        // Can not fuse int16 Sigmoid and Tanh LUT since they require special scaling done by AvgPoolNop
-        return false;
-    }
-
     if ( dependsOn.size() > 1 )
     {
         // Can only fuse with one op
@@ -1454,8 +1441,8 @@ bool EthosU85OpGroup::Fuse(const ArchitectureOpGroupQuery &op, const std::vector
     }
     const EthosU85OpGroup::OpInfo &prevOp = _ops[dep];
 
-    // Can't fuse activation with activation
-    if ( IsActivation(op.type) && _hasFusedActivation )
+    // Can't fuse two consecutive activations
+    if ( IsActivation(op.type) && IsActivation(prevOp.type) )
     {
         return false;
     }
@@ -1489,7 +1476,6 @@ bool EthosU85OpGroup::Fuse(const ArchitectureOpGroupQuery &op, const std::vector
         return false;
     }
 
-    _hasFusedActivation = _hasFusedActivation || IsActivation(op.type);
     _hasFusedTranspose = _hasFusedTranspose || (op.type == OpType::Transpose && !IsNone(op.ofm.transpose));
     _hasFusedReverse = _hasFusedReverse || (op.type == OpType::Reverse && op.ofm.reverse != ReverseType::None);
 
@@ -1512,11 +1498,6 @@ bool EthosU85OpGroup::Chain(const ArchitectureOpGroupQuery &op, const std::vecto
         // can only consider external (non-constant) inputs for chaining
         return false;
     }
-    if ( _opsCount > _chainLength )
-    {
-        // TODO MLBEDSW-9142: support chaining on fused ops
-        return false;
-    }
     if ( npuOp != EthosU85NpuOp::Elementwise )
     {
         return false;
     }
@@ -1583,7 +1564,6 @@ int EthosU85OpGroup::Add(const ArchitectureOpGroupQuery &op, const std::vector
diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.cpp b/ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.cpp
--- a/ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.cpp
+++ b/ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.cpp
-    std::vector<LutSlot> &lutSlots, const HLCOperation *op, int sizeInSlots, int timestamp, bool &alreadyInLutMem)
+int EthosU85RCSGenerator::AllocateLutSlot(std::vector<LutSlot> &lutSlots, const MemArea &memArea, Address address,
+    int lutSize, int timestamp, bool &alreadyInLutMem)
 {
     alreadyInLutMem = false;
+    int lutSlotSize = ArchEthosU85::LUT_SLOT_SIZE;
+    assert(lutSize % lutSlotSize == 0);
+
+    int sizeInSlots = lutSize / lutSlotSize;
     int totalSlots = int(lutSlots.size());
     if ( sizeInSlots < 0 || sizeInSlots > totalSlots )
     {
@@ -671,13 +675,14 @@ int EthosU85RCSGenerator::AllocateLutSlot(
     int allocatedSlot = 0;
     for ( int i = 0; i < totalSlots; i += sizeInSlots )
     {
-        if ( lutSlots[i].hlcOp == op )
+        if ( lutSlots[i].memory == memArea.memory && lutSlots[i].address == address && lutSlots[i].sizeBytes == lutSize )
         {
             // LUT is already in SHRAM
             allocatedSlot = i;
             alreadyInLutMem = true;
             break;
         }
+        assert(allocatedSlot < static_cast<int>(lutSlots.size()));
         if ( lutSlots[i].lastUsed < lutSlots[allocatedSlot].lastUsed )
         {
             allocatedSlot = i;
@@ -685,7 +690,9 @@ int EthosU85RCSGenerator::AllocateLutSlot(
     }
     for ( int j = allocatedSlot; j < allocatedSlot + sizeInSlots; ++j )
     {
-        lutSlots[j].hlcOp = op;
+        lutSlots[j].memory = memArea.memory;
+        lutSlots[j].address = address;
+        lutSlots[j].sizeBytes = lutSize;
         lutSlots[j].lastUsed = timestamp;
     }
     return allocatedSlot;
@@ -949,8 +956,7 @@ void EthosU85RCSGenerator::GeneratePadding(const HLCPadding &padding)
 void EthosU85RCSGenerator::GenerateActivation(const HLCStripe *stripe, MemoryAccesses &memoryAccesses)
 {
     const HLCOperation *op = stripe->operation.get();
-    OpType opType = OpType::None;
-    const HLCParameters *parameters = nullptr;
+    const HLCSubOperation *activationOp = nullptr;
     assert(stripe->opGroup != nullptr);
     EthosU85OpGroup *opGroup = static_cast<EthosU85OpGroup *>(stripe->opGroup);
     auto &ofm = op->ofm;
@@ -958,23 +964,18 @@ void EthosU85RCSGenerator::GenerateActivation(const HLCStripe *stripe, MemoryAcc
 
     if ( IsActivation(op->type) )
     {
         // Non-fused activation
-        opType = op->type;
-        parameters = &op->parameters;
+        activationOp = op;
     }
-    else
+    else if ( op->subOps.size() > 0 )
     {
-        for ( auto &subOp : op->subOps )
+        // Check if the first subOp is a fused activation.
+        auto &subOp = op->subOps[0];
+        if ( opGroup->IsFused(subOp.ifm[0].uid) && IsActivation(subOp.type) )
         {
-            if ( opGroup->IsFused(subOp.ifm[0].uid) && IsActivation(subOp.type) )
-            {
-                // Fused activation
-                opType = subOp.type;
-                parameters = &subOp.parameters;
-                // Use subOp ifm datatype to calculate clip range
-                clipDataType = subOp.ifm[0].dataType;
-                // We know there can be only one fused activation
-                break;
-            }
+            // Fused activation
+            activationOp = &subOp;
+            // Use subOp ifm datatype to calculate clip range
+            clipDataType = subOp.ifm[0].dataType;
         }
     }
@@ -997,13 +998,13 @@ void EthosU85RCSGenerator::GenerateActivation(const HLCStripe *stripe, MemoryAcc
 
     auto act = activation_function::LUT_NONE;
     uint32_t tableIndex = 0;
-    if ( IsLUTType(opType) )
+    if ( activationOp && IsLUTType(activationOp->type) )
     {
-        auto &lutParams = parameters->lut;
+        auto opType = activationOp->type;
+        auto &lutParams = activationOp->parameters.lut;
         int lutSize = lutParams.sizeBytes;
-
-        auto pos = _stripeToLutSlot.find(stripe);
-        if ( pos != _stripeToLutSlot.end() )
+        auto pos = _opToLutSlot.find(activationOp->srcId);
+        if ( pos != _opToLutSlot.end() )
        {
            tableIndex = pos->second;
        }
@@ -1569,14 +1570,33 @@ void EthosU85RCSGenerator::UpdateMemoryAccesses(const MemoryAccesses &memoryAcce
     }
 }
 
+std::unique_ptr<HLCDMA> EthosU85RCSGenerator::CreateLUTDMA(const HLCSubOperation *op, std::vector<LutSlot> &lutSlots, int timestamp)
+{
+    const auto &lutTens = op->parameters.lut;
+    bool alreadyInLutMem;
+    int slot = AllocateLutSlot(lutSlots, lutTens.memArea, lutTens.address, lutTens.sizeBytes, timestamp, alreadyInLutMem);
+    _opToLutSlot[op->srcId] = slot;
+
+    if ( !alreadyInLutMem )
+    {
+        auto dma = std::make_unique<HLCDMA>();
+        dma->srcMemArea = lutTens.memArea;
+        dma->srcAddress = lutTens.address;
+        dma->length = lutTens.sizeBytes;
+        dma->destMemArea = _arch->LUTMemory();
+        dma->destAddress = slot * ArchEthosU85::LUT_SLOT_SIZE;
+        return dma;
+    }
+    return nullptr;
+}
+
 // Inserts DMA commands for copying LUTs from constant memory
 // to LUT memory
 std::vector<std::unique_ptr<HighLevelCommand>>
 EthosU85RCSGenerator::InsertLUTDMACommands(std::vector<std::unique_ptr<HighLevelCommand>> &cmds)
 {
     std::vector<std::unique_ptr<HighLevelCommand>> result;
-    int lutSlotSize = ArchEthosU85::LUT_SLOT_SIZE;
-    int slots = int(_arch->_lutRam->SizeBytes() / lutSlotSize);
+    int slots = int(_arch->_lutRam->SizeBytes() / ArchEthosU85::LUT_SLOT_SIZE);
     std::vector<LutSlot> lutSlots(slots);
     int timestamp = 0;
     result.reserve(cmds.size());
@@ -1587,30 +1607,28 @@ EthosU85RCSGenerator::InsertLUTDMACommands(std::vector
             auto stripe = static_cast<HLCStripe *>(hlc.get());
             auto op = stripe->operation;
-            // TODO MLBEDSW-9142 LUT for chained subOps should be inserted before the primary Op
-            const auto &subOps = stripe->operation->subOps;
-            auto lutSubOp = std::find_if(
-                subOps.begin(), subOps.end(), [](const auto &subOp) { return IsLUTType(subOp.type); });
-            if ( IsLUTType(op->type) || (lutSubOp != subOps.end()) )
+
+            if ( IsLUTType(op->type) )
             {
-                const auto &srcTens = IsLUTType(op->type) ? op->parameters.lut : lutSubOp->parameters.lut;
-                assert(srcTens.sizeBytes % lutSlotSize == 0);
-                bool alreadyInLutMem;
-                int sizeInSlots = srcTens.sizeBytes / lutSlotSize;
-                int slot = AllocateLutSlot(lutSlots, op.get(), sizeInSlots, timestamp, alreadyInLutMem);
-                _stripeToLutSlot[stripe] = slot;
-
-                if ( !alreadyInLutMem )
+                // Create and insert LUT DMA for a primary op activation
+                if ( auto dma = CreateLUTDMA(op.get(), lutSlots, timestamp) )
                 {
-                    auto dma = std::make_unique<HLCDMA>();
-                    dma->srcMemArea = srcTens.memArea;
-                    dma->srcAddress = srcTens.address;
-                    dma->length = srcTens.sizeBytes;
-                    dma->destMemArea = _arch->LUTMemory();
-                    dma->destAddress = slot * lutSlotSize;
                     result.push_back(std::move(dma));
                 }
             }
+
+            // Create and insert LUT DMAs for any fused activations in the opgroup
+            const auto &subOps = stripe->operation->subOps;
+            for ( auto subOp = subOps.begin(); subOp != subOps.end(); subOp++ )
+            {
+                if ( IsLUTType(subOp->type) )
+                {
+                    if ( auto dma = CreateLUTDMA(&(*subOp), lutSlots, timestamp) )
+                    {
+                        result.push_back(std::move(dma));
+                    }
+                }
+            }
         }
         result.push_back(std::move(hlc));
     }
@@ -2007,7 +2025,7 @@ std::shared_ptr EthosU85RCSGenerator::MakeStripeForSubOp(HLCStripe *s
     op->type = subOp.type;
     op->ifm = subOp.ifm;
     op->ofm = subOp.ofm;
-    op->_srcId = subOp._srcId;
+    op->srcId = subOp.srcId;
     if ( IsLUTType(subOp.type) )
     {
         op->parameters.lut = subOp.parameters.lut;
     }
@@ -2099,7 +2117,7 @@ bool EthosU85RCSGenerator::GenerateOpGroup(HLCStripe *stripe, HLCStripe *prevOp,
     // Return command mapping information to the caller
     if ( cmdRanges )
     {
-        cmdRanges->emplace_back(stripe->operation->_srcId, emitStart, _emit.Position());
+        cmdRanges->emplace_back(stripe->operation->srcId, emitStart, _emit.Position());
     }
 
     if ( isChained )
@@ -2214,7 +2232,7 @@ std::vector EthosU85RCSGenerator::GenerateCommandStream(
     std::vector<std::unique_ptr<HighLevelCommand>> &highLevelCommandStream, CmdRanges *cmdRanges, bool verbose)
 {
     _emit.Clear();
-    _stripeToLutSlot.clear();
+    _opToLutSlot.clear();
     GenerateInitialRegisterSetup();
     auto cmds = InsertLUTDMACommands(highLevelCommandStream);
     cmds = InsertTileDMACommands(cmds);
diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.hpp b/ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.hpp
index d37906246c9ec5f09e348767a60cbf7896690a99..a1f7ced2c11802db3d3f2eaaec4e67757165ebd1 100644
--- a/ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.hpp
+++ b/ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.hpp
@@ -100,7 +100,9 @@ using MemoryAccesses = std::vector;
 
 struct LutSlot
 {
-    const HLCOperation *hlcOp = nullptr;
+    const ArchitectureMemory *memory = nullptr;
+    Address address = -1;
+    int sizeBytes = -1;
     int lastUsed = 0;
 };
 
@@ -148,7 +150,8 @@ static int CalcCommandWaits(const MemoryAccesses &opAccesses, std::deque &outstanding);
 
     // Returns LUT slot to be used for the given LUT operation.
    // Sets alreadyInLutMem to true if the LUT is already in SHRAM.
-    int AllocateLutSlot(std::vector<LutSlot> &lutSlots, const HLCOperation *op, int sizeInSlots, int timestamp, bool &alreadyInLutMem);
+    int AllocateLutSlot(std::vector<LutSlot> &lutSlots, const MemArea &memArea, Address address, int lutSize,
+        int timestamp, bool &alreadyInLutMem);
     //----------------------------------------------------------------------
     // Scaling (OFM/IFM/IFM2_SCALE)
     //----------------------------------------------------------------------
@@ -209,6 +212,8 @@ protected:
     void GenerateWaits(bool isKernelWait, const MemoryAccesses &memoryAccesses, std::deque<MemoryAccesses> &outstandingAccesses);
     // Save current memory accesses to accessesToUpdate
     void UpdateMemoryAccesses(const MemoryAccesses &memoryAccesses, std::deque<MemoryAccesses> &accessesToUpdate, int maxWaits);
+    // Create the LUT DMA command required for the given HLCSubOperation
+    std::unique_ptr<HLCDMA> CreateLUTDMA(const HLCSubOperation *op, std::vector<LutSlot> &lutSlots, int timestamp);
     // Inserts DMA commands for copying LUTs from constant memory
     // to LUT memory
     std::vector<std::unique_ptr<HighLevelCommand>> InsertLUTDMACommands(std::vector<std::unique_ptr<HighLevelCommand>> &cmds);
@@ -251,8 +256,8 @@ public:
 
 private:
     ArchEthosU85 *_arch;
-    // For stripes that use LUT: the LUT slot to be used
-    std::unordered_map<const HLCStripe *, int> _stripeToLutSlot;
+    // For operations that use LUT: the LUT slot to be used
+    std::unordered_map<UniqueId, int> _opToLutSlot;
     EthosU85Emitter _emit;
 };
diff --git a/ethosu/regor/compiler/high_level_command_stream.hpp b/ethosu/regor/compiler/high_level_command_stream.hpp
index 463aea5666adc47d97be4fbdae9177190d57b4c0..7a4ba72379320cc297e022d633748fff2b0249ed 100644
--- a/ethosu/regor/compiler/high_level_command_stream.hpp
+++ b/ethosu/regor/compiler/high_level_command_stream.hpp
@@ -159,7 +159,7 @@ struct HLCSubOperation
     std::vector<HLCFeatureMap> ifm;
     HLCFeatureMap ofm;
     HLCParameters parameters = {};
-    UniqueId _srcId = 0;
+    UniqueId srcId = 0;
 };
 
 ///
diff --git a/ethosu/regor/compiler/high_level_command_stream_generator.cpp b/ethosu/regor/compiler/high_level_command_stream_generator.cpp
index b661fe34f4dbf3b71a563f99dcbea711d788ca0e..0141d6a4b5a9d61bd74da73aaae9f183693d857d 100644
--- a/ethosu/regor/compiler/high_level_command_stream_generator.cpp
+++ b/ethosu/regor/compiler/high_level_command_stream_generator.cpp
@@ -311,7 +311,7 @@ static HLCSubOperation MakeSubOperation(const std::unique_ptr
     OFM(), hlcSubOp.ofm);
-    hlcSubOp._srcId = schedOp->Uid();
+    hlcSubOp.srcId = schedOp->Uid();
 
     if ( schedOp->Type() == OpType::LeakyRelu )
     {
@@ -337,7 +337,7 @@ static std::shared_ptr MakeOperation(SchedulerOperation *schedOp,
     op->type = schedOp->Type();
     op->kernel = *schedOp->Kernel();
     op->config = opInfo->Config();
-    op->_srcId = schedOp->Uid();
+    op->srcId = schedOp->Uid();
     size_t ifms = 0;
     for ( const auto &input : schedOp->inputs.pairs() )
     {
@@ -783,7 +783,7 @@ void HLCStreamGenerator::GenerateCommandsForCascade(vector_span
     second.shape;
     hlcOps[i - 1]->ofm.shape = shape;
-    // TODO MLBEDSW-9142: support fused activations inside chains
+    // TODO MLBEDSW-9143: support cascading of chains
     // for now, we assume maximum one subOp (fused activation) on cascades
     if ( hlcOps[i - 1]->subOps.size() )
     {
diff --git a/ethosu/regor/compiler/live_range.cpp b/ethosu/regor/compiler/live_range.cpp
index 856973c610f387286eb3a2db18a833ce06b0caaa..8b2bf156e3c66c6a4dcbcee6b95ecca946c4c24d 100644
--- a/ethosu/regor/compiler/live_range.cpp
+++ b/ethosu/regor/compiler/live_range.cpp
@@ -1,5 +1,5 @@
 //
-// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+// SPDX-FileCopyrightText: Copyright 2021-2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
 //
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -76,14 +76,17 @@ void LiveRangeGraph::ExtractLiveRangesFromCascades(const std::vector
         OpGroup();
         assert(opGroup != nullptr);
-        if ( opGroup->NeedsAllocation(schedOp->OFM()->tensor->uid) )
+
+        // Get the ofm of the last operator in the group
+        auto opGroupOfm = schedOp->SubOps().size() ? schedOp->SubOps().back()->OFM() : schedOp->OFM();
+        if ( opGroup->NeedsAllocation(opGroupOfm->tensor->uid) )
         {
             // Check if op have an ifm tensor that can be reused for the ofm
-            auto ifmTens = ReusableIFM(schedOp, targetMemory);
+            auto ifmTens = ReusableIFM(schedOp, opGroupOfm, targetMemory);
             if ( ifmTens != nullptr )
             {
                 // ifm can be reused
-                FuseRanges(ifmTens, schedOp->OFM()->tensor.get());
+                FuseRanges(ifmTens, opGroupOfm->tensor.get());
             }
         }
     }
@@ -220,13 +223,16 @@ LiveRange *LiveRangeGraph::FuseRanges(SchedulerTensor *inTens, SchedulerTensor *
     return lr;
 }
 
-SchedulerTensor *LiveRangeGraph::ReusableIFM(const std::unique_ptr<SchedulerOperation> &schedOp, const MemArea &targetMemory)
+// Check if any of the IFMs consumed by the first operator in an opgroup can be reused for the OFM
+// tensor of the last operator in the opgroup.
+// Requires the first operator to be an elementwise operator and is also applicable to stand-alone
+// elementwise operators (which are just opgroups of length 1).
+SchedulerTensor *LiveRangeGraph::ReusableIFM(
+    const std::unique_ptr<SchedulerOperation> &schedOp, const SchedulerConnection *ofmConn, const MemArea &targetMemory)
 {
     SchedulerTensor *reusableIfm = nullptr;
     if ( IsElementwise(schedOp->Type()) )
     {
-        // Check if possible to merge ifm/ofm live ranges of elementwise op
-        const auto ofmConn = schedOp->OFM();
         const auto ofmTens = ofmConn->tensor.get();
 
         if ( !ShouldBeIgnored(ofmTens, targetMemory) )
diff --git a/ethosu/regor/compiler/live_range.hpp b/ethosu/regor/compiler/live_range.hpp
index 51048fd1d6154f9292c0fda83297e5334a76eb01..c7b57fca597971353fa4065f729d51ea36a0f9ee 100644
--- a/ethosu/regor/compiler/live_range.hpp
+++ b/ethosu/regor/compiler/live_range.hpp
@@ -1,5 +1,5 @@
 //
-// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+// SPDX-FileCopyrightText: Copyright 2021-2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
 //
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -105,7 +105,8 @@ public:
         Schedule *schedule, const MemArea &targetMemory, bool addRollingBuffers);
     LiveRange *GetOrCreateRange(SchedulerTensor *tens);
     LiveRange *FuseRanges(SchedulerTensor *inTens, SchedulerTensor *outTens);
-    SchedulerTensor *ReusableIFM(const std::unique_ptr<SchedulerOperation> &schedOp, const MemArea &targetMemory);
+    SchedulerTensor *ReusableIFM(const std::unique_ptr<SchedulerOperation> &schedOp, const SchedulerConnection *ofmConn,
+        const MemArea &targetMemory);
     virtual bool ShouldBeIgnored(SchedulerTensor *tens, const MemArea &targetMemory);
 };
 
diff --git a/ethosu/regor/compiler/scheduler_packing.cpp b/ethosu/regor/compiler/scheduler_packing.cpp
index 956daf6c013b5a92c692e74e818ea06e54348e70..2810c792db7da47f806f43a56358429849ed7a23 100644
--- a/ethosu/regor/compiler/scheduler_packing.cpp
+++ b/ethosu/regor/compiler/scheduler_packing.cpp
@@ -281,10 +281,10 @@ void SchedulerPacking::SchedulerPacking::PackOperations()
             LOG_TRACE1("Added {} (key {}) to {} (key {})\n", OpTypeToString(nextOp->Type()), key, OpTypeToString(prevOp->Type()), prevOpKey);
 
-            // Replace primary op's OFM by nextOp's OFM
+            // Replace previous op's OFM by nextOp's OFM
             if ( IsActivation(nextOp->Type()) )
             {
-                auto *ofmConn = primaryOp->OFM();
+                auto *ofmConn = prevOp->OFM();
                 ofmConn->tensor = nextOp->OFM()->tensor;
                 ofmConn->SetType(nextOp->OFM()->Type());
                 ofmConn->quantization.quantMin = nextOp->Output(TensorUsage::OFM)->quantization.quantMin;
@@ -450,7 +450,7 @@ int SchedulerPacking::CanPack(const SchedulerOperation *schedOp, const Scheduler
         return 0;
     }
 
-    if ( schedOp->OFM()->tensor->isGraphOutput )
+    if ( schedOp->OFM()->tensor->isGraphOutput || prevOp->OFM()->tensor->isGraphOutput )
     {
         return 0;
     }