diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.cpp b/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.cpp index 098f9757133dc45ca7637e83badb637e1ba4c564..3bd11ed571bbd8f46a577692da59bb54e45c18a2 100644 --- a/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.cpp +++ b/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.cpp @@ -1302,8 +1302,7 @@ void EthosU55RCSGenerator::InsertTransposeCommand(const HLCStripe *stripe, Tempo auto &ofm = op->ofm; assert(op->subOps.empty()); - assert(ifm.format == TensorFormat::NHWC); - assert(ofm.format == TensorFormat::NHWC); + assert(ifm.dataType == ofm.dataType); assert(((ofm.transpose == TransposeType::NWHC) || !ifm.slice.shape || (ifm.shape == ifm.slice.shape)) && "Implementation cannot be sliced"); ifm.shape = Shape::PadAxes(ifm.shape, 4, 1); assert((ifm.shape.AxisProduct(0, ifm.shape.Size() - 3) <= 1) && "Batch transposes unsupported"); @@ -1316,10 +1315,16 @@ void EthosU55RCSGenerator::InsertTransposeCommand(const HLCStripe *stripe, Tempo if ( identity ) { LOG_WARN("RCS: Emitting no-op transpose as a memory copy\n"); + assert(ifm.format == ofm.format); auto dma = std::make_unique(); dma->srcMemArea = ifm.memArea; dma->srcAddress = ifm.address; - dma->length = DataTypeStorageSizeBytes(ifm.dataType, ifm.shape.Elements()); + int elements = ofm.shape.Elements(); + if ( ifm.format == TensorFormat::NHCWB16 ) + { + elements = (elements / ofm.shape.Depth()) * RoundAway(ofm.shape.Depth(), 16); + } + dma->length = DataTypeStorageSizeBytes(ofm.dataType, elements); dma->destMemArea = ofm.memArea; dma->destAddress = ofm.address; emitted.push_back(dma.get()); @@ -1327,6 +1332,9 @@ void EthosU55RCSGenerator::InsertTransposeCommand(const HLCStripe *stripe, Tempo } else { + assert(ifm.format == TensorFormat::NHWC); + assert(ofm.format == TensorFormat::NHWC); + // Strided output on AveragePool can swap Height/Width over any channel depth by // adjusting the output strides to place the channel arrays in the required layout. // diff --git a/ethosu/regor/compiler/graphir_optimiser.hpp b/ethosu/regor/compiler/graphir_optimiser.hpp index e54590a9e2a098f7eee679293acc932577b37424..0294ed0177ff3b4c6e48e39fc6de9f649226e05a 100644 --- a/ethosu/regor/compiler/graphir_optimiser.hpp +++ b/ethosu/regor/compiler/graphir_optimiser.hpp @@ -149,7 +149,6 @@ private: &GraphIrOptimiser::RewriteDepthwise, &GraphIrOptimiser::RewriteTransposeConvOFMPadding, &GraphIrOptimiser::OptimiseElementwise, - &GraphIrOptimiser::MergeTransposes, &GraphIrOptimiser::RearrangeTranspose, &GraphIrOptimiser::ReshapeReverse, &GraphIrOptimiser::UnrollConv @@ -160,6 +159,7 @@ private: { {}, { + &GraphIrOptimiser::MergeTransposes, &GraphIrOptimiser::MoveSplitSliceToConsumer } },