diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55.cpp b/ethosu/regor/architecture/ethosu55/ethos_u55.cpp index 150890bf2a2b4f7fa4b53f924a6b15cec5dd02b6..d488e56ee51162b05d00f9d099c88b3cf3bbd298 100644 --- a/ethosu/regor/architecture/ethosu55/ethos_u55.cpp +++ b/ethosu/regor/architecture/ethosu55/ethos_u55.cpp @@ -734,7 +734,7 @@ int EthosU55OpGroup::Add(const ArchitectureOpGroupQuery &op, const std::vector &emitted) +void EthosU55RCSGenerator::InsertLUTDMACommand(const HLCStripe *stripe, Temporaries &temps, std::vector &emitted) { int lutSlotSize = _arch->_shram.lutSlotSize; auto op = stripe->operation; @@ -1234,7 +1233,7 @@ void EthosU55RCSGenerator::InsertLUTDMACommand( const auto &lutTens = op->type == OpType::LUT ? op->parameters.lut : op->subOps[0].parameters.lut; assert(config->_layout.lutStart > 0); bool alreadyInLutMem; - int slot = AllocateLutSlot(lutTens.memArea, lutTens.address, lutTens.sizeBytes, index, alreadyInLutMem); + int slot = AllocateLutSlot(lutTens.memArea, lutTens.address, lutTens.sizeBytes, temps.timestamp, alreadyInLutMem); _stripeToLutSlot[stripe] = slot; if ( !alreadyInLutMem ) @@ -1332,7 +1331,10 @@ void EthosU55RCSGenerator::InsertTransposeCommand(const HLCStripe *stripe, Tempo auto &ifm = op->ifm[0]; auto &ofm = op->ofm; - assert(op->subOps.empty()); + bool allowSubOps = DataTypeSizeBits(ofm.dataType) == 8; + bool subOpsRequireLUT = (!op->subOps.empty() && op->subOps[0].type == OpType::LUT); + + assert(op->subOps.empty() || allowSubOps); assert(ifm.dataType == ofm.dataType); assert(((ofm.transpose == TransposeType::NWHC) || !ifm.slice.shape || (ifm.shape == ifm.slice.shape)) && "Implementation cannot be sliced"); ifm.shape = Shape::PadAxes(ifm.shape, 4, 1); @@ -1347,6 +1349,7 @@ void EthosU55RCSGenerator::InsertTransposeCommand(const HLCStripe *stripe, Tempo { LOG_WARN("RCS: Emitting no-op transpose as a memory copy\n"); assert(ifm.format == ofm.format); + assert(op->subOps.empty()); auto dma = std::make_unique(); dma->srcMemArea = ifm.memArea; dma->srcAddress = ifm.address; @@ -1485,9 +1488,14 @@ void EthosU55RCSGenerator::InsertTransposeCommand(const HLCStripe *stripe, Tempo // Create new stripe operations auto cmd = std::make_unique(*stripe); cmd->operation = std::make_shared(); + if ( allowSubOps ) + { + cmd->operation->subOps = op->subOps; + if ( subOpsRequireLUT ) InsertLUTDMACommand(cmd.get(), temps, emitted); + } cmd->operation->kernel = Kernel::UnitKernel(); cmd->operation->type = OpType::AvgPool; - cmd->opGroup = nullptr; + cmd->opGroup = stripe->opGroup; cmd->operation->ifm.push_back(inFM); cmd->operation->ofm = outFM; cmd->ofmArea = outFM.shape; @@ -1634,6 +1642,7 @@ void EthosU55RCSGenerator::InsertMatMulCommand(const HLCStripe *stripe, Temporar // Step 2: REDUCE SUM: TEMP BUFFER -> OFM // Create Reduce sum stripe operation auto sum = std::make_unique(std::make_shared()); + sum->operation->subOps = op->subOps; sum->operation->type = OpType::ReduceSum; sum->operation->kernel = Kernel::UnitKernel(); sum->operation->ifm.push_back(tempFM); @@ -1916,28 +1925,34 @@ void EthosU55RCSGenerator::PrepareCommand(int index, HighLevelCommand *cmd, Temp { emitted.clear(); - if ( cmd->IsStripe() ) + if ( !cmd->IsStripe() ) { - HLCStripe *stripe = static_cast(cmd); - auto op = stripe->operation; - if ( op->type == OpType::Tile ) - { - InsertTileDMACommand(stripe, temps, emitted); - return; // Return early to replace original op - } - else if ( op->type == OpType::LUT || (!op->subOps.empty() && op->subOps[0].type == OpType::LUT) ) - { - InsertLUTDMACommand(index, stripe, temps, emitted); - } - else if ( op->type == OpType::Transpose ) - { - InsertTransposeCommand(stripe, temps, emitted); - return; - } - else if ( op->type == OpType::MatMul ) + // Emit original op + emitted.push_back(cmd); + return; + } + + HLCStripe *stripe = static_cast(cmd); + auto op = stripe->operation; + temps.timestamp = index; + if ( op->type == OpType::Tile ) + { + InsertTileDMACommand(stripe, temps, emitted); + } + else if ( op->type == OpType::Transpose ) + { + InsertTransposeCommand(stripe, temps, emitted); + } + else if ( op->type == OpType::MatMul ) + { + InsertMatMulCommand(stripe, temps, emitted); + } + else + { + // Pre-prepared ops must integrate sub-op lut handling in case the incoming stripe is replaced + if ( (op->type == OpType::LUT) || (!op->subOps.empty() && op->subOps[0].type == OpType::LUT) ) { - InsertMatMulCommand(stripe, temps, emitted); - return; + InsertLUTDMACommand(stripe, temps, emitted); } else if ( _arch->_shram.reservedEndBanks == 0 ) { @@ -1947,10 +1962,9 @@ void EthosU55RCSGenerator::PrepareCommand(int index, HighLevelCommand *cmd, Temp slot = {}; } } + // Emit original op + emitted.push_back(cmd); } - - // Emit original op - emitted.push_back(cmd); } diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.hpp b/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.hpp index 662ce49a148eed65e8ce8c17d45e9651f8b2028c..58fe523b803ee9a941b87b45f155cffcd93ea9b0 100644 --- a/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.hpp +++ b/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.hpp @@ -232,19 +232,19 @@ protected: struct Temporaries { + int timestamp; std::vector> cmds; std::vector> configs; }; // Inserts DMA commands for copying LUTs from constant memory to LUT memory - void InsertLUTDMACommand(int index, const HLCStripe *stripe, Temporaries &temps, std::vector &emitted); + void InsertLUTDMACommand(const HLCStripe *stripe, Temporaries &temps, std::vector &emitted); // Inserts DMA commands to handle TILE operations virtual void InsertTileDMACommand(const HLCStripe *stripe, Temporaries &temps, std::vector &emitted); // Inserts commands to handle transposing virtual void InsertTransposeCommand(const HLCStripe *stripe, Temporaries &temps, std::vector &emitted); // Inserts commands to handle MATMUL operations void InsertMatMulCommand(const HLCStripe *stripe, Temporaries &temps, std::vector &emitted); - //---------------------------------------------------------------------- // Operations //---------------------------------------------------------------------- diff --git a/ethosu/regor/compiler/high_level_command_stream.hpp b/ethosu/regor/compiler/high_level_command_stream.hpp index 7a4ba72379320cc297e022d633748fff2b0249ed..773899a8e2ec920435cedbaffe8642e6aad40a0f 100644 --- a/ethosu/regor/compiler/high_level_command_stream.hpp +++ b/ethosu/regor/compiler/high_level_command_stream.hpp @@ -160,6 +160,18 @@ struct HLCSubOperation HLCFeatureMap ofm; HLCParameters parameters = {}; UniqueId srcId = 0; + HLCSubOperation() = default; + HLCSubOperation(const HLCSubOperation &other) { *this = other; } + void operator=(const HLCSubOperation &other) + { + type = other.type; + ifm = other.ifm; + ofm = other.ofm; + // Compilers disagree on whether the union is copyable. + if ( other.type == OpType::LUT ) parameters.lut = other.parameters.lut; + else parameters.resize = other.parameters.resize; + srcId = other.srcId; + } }; ///