From 824e8cf4ff9da5d1305b14462118d799ffe16d86 Mon Sep 17 00:00:00 2001 From: Philip Hall Date: Thu, 5 Dec 2024 17:59:04 +0000 Subject: [PATCH] MLBEDSW-10102: Implement Ethos-U55 transposes This commit implements 2-axis transposes for up to rank 3 shapes using AveragePool for both Ethos-U55 and Ethos-U65. - Updated shape stride generator to allow asymmetric rank granules. - Allow null opgroups for substituted stripes in the Ethos-U55 RCS generator. - Implementation also falls back to a memory copy for non-transposes that may reach the RCS. Signed-off-by: Philip Hall Change-Id: I09082e71c931a7a9f433ddb1bdc4ed1128e2e95b --- .../ethosu55/ethos_u55_constraints.cpp | 8 + .../ethos_u55_register_cs_generator.cpp | 189 +++++++++++++++++- .../ethos_u55_register_cs_generator.hpp | 3 +- ethosu/regor/common/numeric_util.hpp | 12 ++ ethosu/regor/common/shape.hpp | 26 ++- ethosu/regor/compiler/kernel.hpp | 2 + ethosu/regor/compiler/scheduler.cpp | 34 +++- ethosu/regor/compiler/scheduler_decompose.cpp | 5 +- ethosu/regor/compiler/scheduler_operation.hpp | 9 +- 9 files changed, 264 insertions(+), 24 deletions(-) diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_constraints.cpp b/ethosu/regor/architecture/ethosu55/ethos_u55_constraints.cpp index 62caffba..49c66485 100644 --- a/ethosu/regor/architecture/ethosu55/ethos_u55_constraints.cpp +++ b/ethosu/regor/architecture/ethosu55/ethos_u55_constraints.cpp @@ -37,6 +37,14 @@ bool EthosU55Constraints::SupportsMatMul(OpType opType) TransposeSupport EthosU55Constraints::SupportsTranspose(OpType opType, TransposeType transposeType) { if ( IsNone(transposeType) ) return TransposeSupport::Any; + + if ( opType == OpType::Transpose ) + { + if ( transposeType == TransposeType::NWHC || transposeType == TransposeType::NHCW || transposeType == TransposeType::NCWH ) + { + return TransposeSupport::NHWC; + } + } return TransposeSupport::None; } diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.cpp 
b/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.cpp index b8a662e3..e7a378ee 100644 --- a/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.cpp +++ b/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.cpp @@ -368,10 +368,7 @@ TileBox EthosU55RCSGenerator::GetTiles(const HLCFeatureMap &fm, const Shape &str } if ( fm.format == TensorFormat::NHCWB16 ) { - for ( int i = 0; i < 4; ++i ) - { - assert(tiles.address[i] % 16 == 0 && "NHCWB16 base address is not 16-byte aligned"); - } + assert((tiles.address[0] | tiles.address[1] | tiles.address[2] | tiles.address[3]) % 16 == 0 && "NHCWB16 base address is not 16-byte aligned"); } return tiles; } @@ -757,8 +754,6 @@ void EthosU55RCSGenerator::GenerateActivation(const HLCStripe *stripe, MemoryAcc assert(op->subOps.size() <= 1); OpType opType = OpType::None; const HLCParameters *parameters = nullptr; - assert(stripe->opGroup != nullptr); - EthosU55OpGroup *opGroup = static_cast(stripe->opGroup); if ( IsActivation(op->type) ) { // Non-fused activation @@ -766,7 +761,8 @@ void EthosU55RCSGenerator::GenerateActivation(const HLCStripe *stripe, MemoryAcc parameters = &op->parameters; assert(op->subOps.empty() || opType == op->subOps[0].type); } - else if ( !op->subOps.empty() && !opGroup->NeedsAllocation(op->subOps[0].ifm[0].uid) ) + else if ( !op->subOps.empty() && + (stripe->opGroup && !static_cast(stripe->opGroup)->NeedsAllocation(op->subOps[0].ifm[0].uid)) ) { // Fused activation assert(IsActivation(op->subOps[0].type)); @@ -1242,6 +1238,180 @@ void EthosU55RCSGenerator::InsertTileDMACommand(const HLCStripe *stripe, Tempora } } +static inline int FirstSwapped(unsigned transpose, int &from) +{ + unsigned mask = unsigned(transpose) ^ unsigned(TransposeType::None); + for ( int i = 0; i < 8; i++ ) + { + if ( mask & 0xF ) + { + from = (transpose >> (i * 4)) & 0xF; + return i; + } + mask = mask >> 4; + } + from = 0; + return -1; +} + +void 
EthosU55RCSGenerator::InsertTransposeCommand(const HLCStripe *stripe, Temporaries &temps, std::vector &emitted) +{ + auto op = stripe->operation; + auto &ifm = op->ifm[0]; + auto &ofm = op->ofm; + + assert(op->subOps.empty()); + assert(ifm.format == TensorFormat::NHWC); + assert(ofm.format == TensorFormat::NHWC); + assert(ifm.shape.Size() <= 4); + ifm.shape = Shape::PadAxes(ifm.shape, 4, 0); + Shape outShape = ifm.shape.Permute(unsigned(ofm.transpose)); + + // Which indexed axes have been swapped + unsigned swapMask = unsigned(ofm.transpose) ^ unsigned(TransposeType::None); + unsigned validMask = ofm.shape.ShapeMask(); + bool identity = (swapMask == 0) || (outShape.EqualMask(ofm.shape.WithOnes()) == validMask); + if ( identity ) + { + LOG_WARN("RCS: Emitting no-op transpose as a memory copy\n"); + auto dma = std::make_unique(); + dma->srcMemArea = ifm.memArea; + dma->srcAddress = ifm.address; + dma->length = DataTypeStorageSizeBytes(ifm.dataType, ifm.shape.Elements()); + dma->destMemArea = ofm.memArea; + dma->destAddress = ofm.address; + emitted.push_back(dma.get()); + temps.cmds.push_back(std::move(dma)); + } + else + { + // Strided output on AveragePool can swap Height/Width over any channel depth by + // adjusting the output strides to place the channel arrays in the required layout. + // + // IFM [h_pos, w_pos] = h_pos * ifm_stride_h + w_pos * ifm_stride_w + // OFM [h_pos, w_pos] = h_pos * ofm_stride_w + w_pos * ofm_stride_h (stride has been swapped) + // + // Example: + // Shape (2,3) transposed to Shape (3,2) + // |0|1|2| ifm_stride_w = 1 |0|3| ofm_stride_w = 1 + // |4|5|6| ifm_stride_h = 3 |1|4| ofm_stride_h = 2 + // |2|5| + // + // This can be used to implement any 2-axis channel swap for 3 axes or fewer. + // NWHC - Transpose volume in a single pass. + // NHCW - Transpose 'height' number of CW1 slices. + // NCWH - Transpose 'width' number of HW1 slices (requires extra IFM striding). 
+ HLCFeatureMap inFM = ifm; + HLCFeatureMap outFM = ofm; + + // Only two axis swaps can be achieved using AvgPool + if ( NonZeroNybbles(swapMask) == 2 ) + { + int elementSize = DataTypeSizeBits(ofm.dataType) / 8; + + // Can only swap 2 axes at once using this method + int from; + int to = FirstSwapped(unsigned(ofm.transpose), from); + from = ifm.shape.Size() - 1 - from; + to = ifm.shape.Size() - 1 - to; + + // Place the swappable axes in H/W (works in elements here) + outFM.shape = Shape(1, ifm.shape[from], ifm.shape[to], 1); + int depth = 1, slices = 1; + int ifmStep = 0; + int ofmStep = 0; + + // Not all elements participate in the transposed axes + if ( outFM.shape.Elements() != ifm.shape.Elements() ) + { + if ( ofm.transpose == TransposeType::NWHC ) + { + depth = ifm.shape.Depth(); + slices = 1; + ifmStep = ofmStep = 0; + assert(from == 1 && to == 2); + } + else if ( ofm.transpose == TransposeType::NHCW ) + { + depth = 1; + slices = ifm.shape.Height(); + ifmStep = ofmStep = ifm.shape.Depth() * ifm.shape.Width() * elementSize; + assert(from == 2 && to == 3); + } + else if ( ofm.transpose == TransposeType::NCWH ) + { + depth = 1; + slices = ifm.shape.Width(); + ifmStep = ifm.shape.Depth() * elementSize; + ofmStep = ifm.shape.Height() * elementSize; + assert(from == 1 && to == 3); + } + else + { + assert(false && "Unsupported transpose"); + } + } + + // Recalculate destination as same as source but with output different strides + outFM.shape = Shape(1, ifm.shape[from], ifm.shape[to], depth * elementSize); + inFM.shape = outFM.shape; + // Shapes are measured in terms of bytes, not elements. 
+ outFM.dataType = DataType::Int8; + inFM.dataType = DataType::Int8; + + // Special case for IFM with sparse strides + if ( (slices > 1) && (ofm.transpose == TransposeType::NCWH) ) + { + outFM.strides = Shape(1, elementSize, elementSize * ifm.shape.Width() * ifm.shape.Height(), elementSize); + inFM.strides = Shape(1, elementSize * ifm.shape.Width() * ifm.shape.Depth(), elementSize, elementSize); + } + else + { + outFM.strides = Shape(1, elementSize * depth, elementSize * depth * outFM.shape.Height(), elementSize); + inFM.strides = Shape::GetStridesForShape(inFM.shape, 1); + } + + // Repeat the transpose at advancing offsets for each slice + for ( int i = 0; i < slices; i++ ) + { + // Create new stripe operations + auto cmd = std::make_unique(*stripe); + cmd->operation = std::make_shared(); + cmd->operation->kernel = Kernel::UnitKernel(); + cmd->operation->type = OpType::AvgPool; + cmd->opGroup = nullptr; + cmd->operation->ifm.push_back(inFM); + cmd->operation->ofm = outFM; + cmd->ofmArea = outFM.shape; + cmd->ifmAreas[0] = inFM.shape; + + // Find a common block configuration + if ( i == 0 ) + { + ArchitectureConfigQuery query{}; + query.kernel = &cmd->operation->kernel; + query.ifmBits = DataTypeSizeBits(ifm.dataType); + query.ifmShape[0] = inFM.shape; + query.ofmShape = outFM.shape; + query.ofmFormat = TensorFormat::NHWC; + query.transpose = ofm.transpose; + temps.configs.push_back(_arch->FindBlockConfig(cmd->operation->type, query)); + } + cmd->operation->config = temps.configs.back().get(); + // Add to CMD list + emitted.push_back(cmd.get()); + temps.cmds.push_back(std::move(cmd)); + // Move to next slice + inFM.address += ifmStep; + outFM.address += ofmStep; + } + } + else + { + assert(false && "3-axis swaps must be decomposed"); + } + } +} //---------------------------------------------------------------------- @@ -1497,6 +1667,11 @@ void EthosU55RCSGenerator::PrepareCommand(int index, HighLevelCommand *cmd, Temp { InsertLUTDMACommand(index, stripe, temps, 
emitted); } + else if ( op->type == OpType::Transpose ) + { + InsertTransposeCommand(stripe, temps, emitted); + return; + } else if ( _arch->_shram.reservedEndBanks == 0 ) { // LUT is overwritten by SHRAM accumulator buffers; clear slots diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.hpp b/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.hpp index 26785525..8c01bff9 100644 --- a/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.hpp +++ b/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.hpp @@ -236,11 +236,12 @@ protected: void InsertLUTDMACommand(int index, const HLCStripe *stripe, Temporaries &temps, std::vector &emitted); // Inserts DMA commands to handle TILE operations virtual void InsertTileDMACommand(const HLCStripe *stripe, Temporaries &temps, std::vector &emitted); + // Inserts commands to handle transposing + virtual void InsertTransposeCommand(const HLCStripe *stripe, Temporaries &temps, std::vector &emitted); //---------------------------------------------------------------------- // Operations //---------------------------------------------------------------------- - struct AccessTracking { std::deque outstandingNpuAccesses; diff --git a/ethosu/regor/common/numeric_util.hpp b/ethosu/regor/common/numeric_util.hpp index d3406ae4..291ada24 100644 --- a/ethosu/regor/common/numeric_util.hpp +++ b/ethosu/regor/common/numeric_util.hpp @@ -376,6 +376,18 @@ constexpr bool IsPowerOfTwo(T x) return x > 0 && (x & (x - 1)) == 0; } +// Count the number of nonzero nybbles +inline unsigned NonZeroNybbles(unsigned mask) +{ + mask |= mask >> 2; // technically & with 0xCCCC and 0x2222 + mask |= mask >> 1; // but bits don't travel far enough to leak + mask &= 0x11111111; // =...A...A...A...A...A...A...A...A + mask += mask >> 16; // +...x...x...x...x...B...B...B...B + mask += mask >> 8; // +...x...x...x...x...x...x...C...C + mask += mask >> 4; // +...x...x...x...x...x...x...x...D + return
mask & 0xF; +}; + template OUT ClampToType(IN x) { diff --git a/ethosu/regor/common/shape.hpp b/ethosu/regor/common/shape.hpp index 3271786c..e6753274 100644 --- a/ethosu/regor/common/shape.hpp +++ b/ethosu/regor/common/shape.hpp @@ -264,11 +264,11 @@ public: // Compute a product of axes from start to end int AxisProduct(int start, int end) const { + if ( end > Size() ) end = Size(); if ( start == end ) return 0; - int tmp = 1; - assert(end <= Size()); int inner = ToOffset(end - 1); int outer = ToOffset(start); + int tmp = 1; for ( int i = inner; i <= outer; i++ ) { tmp *= At(i); @@ -509,6 +509,13 @@ public: unsigned EqualMask(const Shape &other) const { return MinAxisFunc>(*this, other); } + unsigned ShapeMask() const + { + unsigned shift = unsigned(_last); + shift = (shift < 32) ? (31u - shift) : 31u; + return ~0u >> shift; + } + bool IsValid() const { return _last >= 0; } bool IsDynamic() const { return _dynamic; } @@ -724,6 +731,7 @@ public: static Shape PadAxes(const Shape &shape, int axes, int padValue) { + if ( shape.Size() == axes ) return shape; return Shape(shape, std::max(axes, shape.Size()), padValue); } @@ -754,20 +762,26 @@ public: static Shape Wrap(const Shape &a, const Shape &b) { return Shape::MinFunc>(a, b); } + static Shape GetStridesForShape(const Shape &shape, int elementBytes) + { + return GetStridesForShape(shape, Shape(elementBytes)); + } + static Shape GetStridesForShape(const Shape &shape, const Shape &granularity) { - assert(granularity.Size() >= shape.Size()); Shape tmp(nullptr, shape.Size()); if ( shape.IsValid() ) { auto *gran = granularity.Storage(); auto *from = shape.Storage(); auto *result = tmp.Storage(); + int lastGranule = std::min(shape._last, granularity._last); result[0] = gran[0]; - for ( int i = 1; i <= shape._last; i++ ) - { + int i = 1; + for ( ; i <= lastGranule; i++ ) result[i] = ::RoundAway(result[i - 1] * from[i - 1], gran[i]); - } + for ( ; i <= shape._last; i++ ) + result[i] = result[i - 1] * from[i - 1]; } return 
tmp; } diff --git a/ethosu/regor/compiler/kernel.hpp b/ethosu/regor/compiler/kernel.hpp index 675b3764..a0b8f8f0 100644 --- a/ethosu/regor/compiler/kernel.hpp +++ b/ethosu/regor/compiler/kernel.hpp @@ -158,6 +158,8 @@ public: return fmt::format("size={},{} stride={},{}, dilation={},{} padding={}", _size.x, _size.y, _stride.x, _stride.y, _dilation.x, _dilation.y, _padding.ToString()); } + + static Kernel UnitKernel() { return Kernel({1, 1}, {1, 1}, {1, 1}); } }; static inline int RequiredInputSize(int value, int stride, int border, int upscale, int rounding = 0) diff --git a/ethosu/regor/compiler/scheduler.cpp b/ethosu/regor/compiler/scheduler.cpp index 595e7ee8..5523bd16 100644 --- a/ethosu/regor/compiler/scheduler.cpp +++ b/ethosu/regor/compiler/scheduler.cpp @@ -20,6 +20,7 @@ #include "common/logging.hpp" +#include "architecture/architecture_constraints.hpp" #include "architecture/weight_encoder.hpp" #include "cascade_builder.hpp" #include "common/data_type.hpp" @@ -193,7 +194,7 @@ static bool CheckLinearFormatForConcatSplit(SchedulerTensor *tensor) } -static int UpdateSchedulerTensor(TensorUsage usage, SchedulerConnection *conn) +static int UpdateSchedulerTensor(Architecture *arch, TensorUsage usage, SchedulerConnection *conn) { auto tensor = conn->tensor.get(); @@ -222,6 +223,15 @@ static int UpdateSchedulerTensor(TensorUsage usage, SchedulerConnection *conn) { tensor->needsLinearFormat = true; } + else if ( producer->Type() == OpType::Transpose ) + { + TransposeSupport supported = arch->Constraints()->SupportsTranspose(OpType::Transpose, producer->OFM()->transpose); + if ( supported == TransposeSupport::NHWC ) + { + tensor->needsLinearFormat = true; + } + } + if ( !producer->IsNpuOp() ) { tensor->hasCPUWriters = true; @@ -248,11 +258,20 @@ static int UpdateSchedulerTensor(TensorUsage usage, SchedulerConnection *conn) continue; } // Int32 ReduceSum requires linear format - if ( consumer->Type() == OpType::ReduceSum && tensor->dataType == DataType::Int32 ) + 
else if ( consumer->Type() == OpType::ReduceSum && tensor->dataType == DataType::Int32 ) { tensor->needsLinearFormat = true; continue; } + else if ( consumer->Type() == OpType::Transpose ) + { + TransposeSupport supported = arch->Constraints()->SupportsTranspose(OpType::Transpose, consumer->OFM()->transpose); + if ( supported == TransposeSupport::NHWC ) + { + tensor->needsLinearFormat = true; + } + } + // Check if consumer shape requires linear format // Brick format can only be used if both shapes have equal W and C // Need to check full shape on connection since tensor might have many producers (concat) @@ -295,23 +314,26 @@ Address Scheduler::CreateSchedulerRepresentation() for ( auto pos : schedOp->outputs.pairs() ) { - opMemoryRequired += UpdateSchedulerTensor(pos.first, &pos.second); + assert(!pos.second.tensor->producers.empty()); + opMemoryRequired += UpdateSchedulerTensor(_arch, pos.first, &pos.second); } for ( auto pos : schedOp->inputs.pairs() ) { - opMemoryRequired += UpdateSchedulerTensor(pos.first, &pos.second); + assert(!pos.second.tensor->consumers.empty()); + opMemoryRequired += UpdateSchedulerTensor(_arch, pos.first, &pos.second); } + for ( auto const &subOp : schedOp->SubOps() ) { for ( auto pos : subOp->outputs.pairs() ) { - opMemoryRequired += UpdateSchedulerTensor(pos.first, &pos.second); + opMemoryRequired += UpdateSchedulerTensor(_arch, pos.first, &pos.second); } for ( auto pos : subOp->inputs.pairs() ) { - opMemoryRequired += UpdateSchedulerTensor(pos.first, &pos.second); + opMemoryRequired += UpdateSchedulerTensor(_arch, pos.first, &pos.second); } } minMemoryRequired = std::max(minMemoryRequired, opMemoryRequired); diff --git a/ethosu/regor/compiler/scheduler_decompose.cpp b/ethosu/regor/compiler/scheduler_decompose.cpp index 9a6053d1..cd3adf05 100644 --- a/ethosu/regor/compiler/scheduler_decompose.cpp +++ b/ethosu/regor/compiler/scheduler_decompose.cpp @@ -220,6 +220,8 @@ bool CanRunOnHardware(Architecture *arch, const SchedulerOperation 
*schedOp) auto &ofmShape = schedOp->OFM()->SliceShape(); if ( ofmShape.Size() > 3 && ofmShape.Elements() > ofmShape.Width() * ofmShape.Height() * ofmShape.Depth() ) return false; + if ( arch->Constraints()->SupportsTranspose(schedOp->Type(), schedOp->OFM()->transpose) == TransposeSupport::None ) + return false; } auto *ifm = schedOp->TryIFM(0); auto *ifm2 = schedOp->TryIFM(1); @@ -1380,7 +1382,8 @@ std::vector> DecomposeTranspose(Architecture const auto axes = ifmShape.Size(); // We can handle all transpositions in a 3D shape - if ( axes < 4 || ifmShape.Elements() == ifmShape.Height() * ifmShape.Width() * ifmShape.Depth() ) + if ( (axes < 4 || ifmShape.Elements() == ifmShape.Height() * ifmShape.Width() * ifmShape.Depth()) && + arch->Constraints()->SupportsTranspose(op->Type(), ofmConn->transpose) != TransposeSupport::None ) { for ( int axis = 0; axis < axes; axis++ ) { diff --git a/ethosu/regor/compiler/scheduler_operation.hpp b/ethosu/regor/compiler/scheduler_operation.hpp index 02777648..1db3d2d7 100644 --- a/ethosu/regor/compiler/scheduler_operation.hpp +++ b/ethosu/regor/compiler/scheduler_operation.hpp @@ -334,10 +334,13 @@ public: { for ( const auto &item : list->pairs() ) { - auto usage = item.first; const auto &connection = item.second; - auto &vec = IsOFM(usage) ? connection.tensor->producers : connection.tensor->consumers; - vec.erase(std::remove(vec.begin(), vec.end(), this), vec.end()); + if ( connection.tensor ) + { + auto usage = item.first; + auto &vec = IsOFM(usage) ? connection.tensor->producers : connection.tensor->consumers; + vec.erase(std::remove(vec.begin(), vec.end(), this), vec.end()); + } } } inputs.clear(); -- GitLab