diff --git a/ethosu/regor/compiler/graphir_optimiser.cpp b/ethosu/regor/compiler/graphir_optimiser.cpp
index de1f6f348eec359705e5de12fbc89e1a97c915c9..644b55be1ab27fc512e17b6c80ee51a7c798af2f 100644
--- a/ethosu/regor/compiler/graphir_optimiser.cpp
+++ b/ethosu/regor/compiler/graphir_optimiser.cpp
@@ -2122,6 +2122,7 @@ Operation *GraphIrOptimiser::MoveSplitSliceToConsumer(Graph *const, Operation *c
         auto *consIfm1 = cons->IFM(1);
 
         bool ifmShapeEqual = false;
+        bool bothHaveIfmStride = false;
 
         // Don't move to CPU, Reshape or Tile operations
         // low-level implementation of TILE requires unsliced inputs
@@ -2135,12 +2136,24 @@ Operation *GraphIrOptimiser::MoveSplitSliceToConsumer(Graph *const, Operation *c
             // Check if ifm0 consumer has correct shape
             auto *consIfm0Conn = cons->Input(TensorUsage::IFM0);
             ifmShapeEqual = Shape::IsReducedEqual(consIfm0Conn->shape, ofmConn->shape);
+
+            // Check if both ifm and ifm0 consumer have stride
+            const auto &ifmStride = ifmConn->slice.stride;
+            const auto &conIfmStride = consIfm0Conn->slice.stride;
+            bothHaveIfmStride =
+                ifmStride && ifmStride != ifmStride.WithOnes() && conIfmStride && conIfmStride != conIfmStride.WithOnes();
         }
         else if ( consIfm1 != nullptr && consIfm1 == ofm )
         {
             // Check if ifm1 consumer has correct shape
             auto *consIfm1Conn = cons->Input(TensorUsage::IFM1);
             ifmShapeEqual = Shape::IsReducedEqual(consIfm1Conn->shape, ofmConn->shape);
+
+            // Check if both ifm and ifm1 consumer have stride
+            const auto &ifmStride = ifmConn->slice.stride;
+            const auto &conIfmStride = consIfm1Conn->slice.stride;
+            bothHaveIfmStride =
+                ifmStride && ifmStride != ifmStride.WithOnes() && conIfmStride && conIfmStride != conIfmStride.WithOnes();
         }
 
         // Calculate the consumer transpose type
@@ -2152,7 +2165,7 @@ Operation *GraphIrOptimiser::MoveSplitSliceToConsumer(Graph *const, Operation *c
 
         // We can only move to consumer if there is no transpose on the op that we move to,
         // otherwise the IFM shape may change and transposition will be wrong.
-        if ( Shape::IsReducedEqual(ofmConn->shape, ofm->StorageShape()) && IsNone(consumerTranspose) && ifmShapeEqual )
+        if ( Shape::IsReducedEqual(ofmConn->shape, ofm->StorageShape()) && IsNone(consumerTranspose) && ifmShapeEqual && !bothHaveIfmStride )
         {
             // Split/Slice can be performed by tensor consumer
             MoveToConsumer(operation, cons.get());
diff --git a/ethosu/regor/compiler/operation.hpp b/ethosu/regor/compiler/operation.hpp
index 200a8216aab360f56261ccbd146fcff7959839da..c2a72a9e020ae48f760e4d2675f673aabd43e96e 100644
--- a/ethosu/regor/compiler/operation.hpp
+++ b/ethosu/regor/compiler/operation.hpp
@@ -49,17 +49,37 @@ enum class RoundMode : uint8_t
 struct TensorSlice
 {
     Shape offset;
-    Shape shape;
+    Shape shape;  // Shape before striding
+    Shape stride;
+
+    TensorSlice() {}
+    TensorSlice(const Shape &offset_, const Shape &shape_) : offset(offset_), shape(shape_) {}
+    TensorSlice(const Shape &offset_, const Shape &shape_, const Shape &stride_) :
+            offset(offset_), shape(shape_), stride(stride_)
+    {
+    }
+
     // Initialize a TensorSlice if current offset/shape are invalid
-    void Initialize(const Shape &_offset, const Shape &_shape)
+    void Initialize(const Shape &offset_, const Shape &shape_)
     {
         if ( !shape )
         {
-            shape = _shape;
+            shape = shape_;
         }
         if ( !offset )
         {
-            offset = _offset;
+            offset = offset_;
+        }
+    }
+
+    // Initialize a TensorSlice if current offset/shape/stride are invalid
+    void Initialize(const Shape &offset_, const Shape &shape_, const Shape &stride_)
+    {
+        Initialize(offset_, shape_);
+
+        if ( !stride )
+        {
+            stride = stride_;
         }
     }
 };
diff --git a/ethosu/regor/compiler/scheduler.cpp b/ethosu/regor/compiler/scheduler.cpp
index e0df646da5c83873fd254a684f5077253245d9cb..6c74bf3f05375311e8efef5419787e2f5bc6c519 100644
--- a/ethosu/regor/compiler/scheduler.cpp
+++ b/ethosu/regor/compiler/scheduler.cpp
@@ -205,6 +205,12 @@ int Scheduler::UpdateSchedulerTensor(TensorUsage usage, SchedulerConnection *con
         conn->requireFullTensor = true;
     }
 
+    // Force linear format for read only tensors
+    if ( tensor->IsConstant() )
+    {
+        tensor->needsLinearFormat = true;
+    }
+
     // Force linear output from Reverse for C dimension because brick output from Reverse has special requirements
     if ( IsOFM(usage) && conn->reverse == ReverseType::C )
     {
@@ -216,6 +222,12 @@ int Scheduler::UpdateSchedulerTensor(TensorUsage usage, SchedulerConnection *con
         tensor->needsLinearFormat = true;
     }
 
+    // Force linear format for strided access in the width dimension
+    if ( conn->stepXY.x != 1 )
+    {
+        tensor->needsLinearFormat = true;
+    }
+
     for ( auto producer : tensor->producers )
     {
         // TODO: Gather doesn't support brick format yet (MLBEDSW-8410)
@@ -323,8 +335,8 @@ int Scheduler::UpdateSchedulerTensor(TensorUsage usage, SchedulerConnection *con
         tensor->memArea = _arch->OutputFeatureMapMemory();
     }
 
-    // Set tensor format to NHCWB16 for output FeatureMaps, if possible
-    if ( IsOFM(usage) )
+    // Set tensor format to NHCWB16 for FeatureMaps, if possible
+    if ( IsIFM(usage) || IsOFM(usage) )
     {
         tensor->format = tensor->needsLinearFormat ? TensorFormat::NHWC : TensorFormat::NHCWB16;
     }
diff --git a/ethosu/regor/compiler/scheduler_packing.cpp b/ethosu/regor/compiler/scheduler_packing.cpp
index 3b56b1a3cf5c572b53d52c2ed0177dbf68f685c4..cadef74fad5c41ca0adbdbb0d37b831d33c8856a 100644
--- a/ethosu/regor/compiler/scheduler_packing.cpp
+++ b/ethosu/regor/compiler/scheduler_packing.cpp
@@ -415,12 +415,17 @@ void SchedulerPacking::InitSchedulerConnection(
 {
     schedConn->tensor = tensor;
     // Convert to (minimum) 4D-shapes in scheduler-IR
-    schedConn->slice = {Shape::PadAxes(conn.slice.offset, 4, 0), Shape::PadAxes(conn.slice.shape, 4, 1)};
+    schedConn->slice = {Shape::PadAxes(conn.slice.offset, 4, 0), Shape::PadAxes(conn.slice.shape, 4, 1),
+        Shape::PadAxes(conn.slice.stride, 4, 1)};
     schedConn->shape = Shape::PadAxes(conn.shape, 4, 1);
     schedConn->quantization = conn.quantization;
     schedConn->reverse = conn.reverse;
     schedConn->resamplingMode = ArchResampling::None;
     schedConn->rounding = conn.rounding;
+    if ( schedConn->slice.stride )
+    {
+        schedConn->stepXY = schedConn->slice.stride.WH();
+    }
 }
 
 void SchedulerPacking::InitSchedulerTensor(SchedulerTensor *schedTensor, Tensor *tensor, const Graph *graph)
diff --git a/ethosu/regor/compiler/tflite_graph_optimiser.cpp b/ethosu/regor/compiler/tflite_graph_optimiser.cpp
index 2a1698561d43c9044bec3202ceacd239193477c8..e8fe64432b24a411dbd6265c4e0d18d561e00f41 100644
--- a/ethosu/regor/compiler/tflite_graph_optimiser.cpp
+++ b/ethosu/regor/compiler/tflite_graph_optimiser.cpp
@@ -733,28 +733,24 @@ Operation *TFLiteGraphOptimiser::RewriteStridedSlice(Graph *const graph, Operati
         }
     }
 
-    // TODO MLBEDSW-10165: Handle stride != 1
-    if ( sliceStride != sliceStride.WithOnes() )
+    // TODO MLBEDSW-10165: Handle stride < 0 and other dimensions than H and W
+    if ( sliceStride.LessMask(sliceStride.WithZeros()) ||
+         sliceStride.WithHeight(1).WithWidth(1) != Shape::PadAxes(sliceShape.WithOnes(), 3, 1) )
     {
         returnOp->SetPassthroughOp();
         return returnOp;
     }
 
-    // Adjust resulting shape for stride
-    sliceShape = Shape::DivRoundUp(sliceShape, sliceStride);
-
-    // Create a new SLICE op
-    auto sliceOp = std::make_shared<Operation>(OpType::Slice);
-    sliceOp->CopyInput(TensorUsage::IFM, *ifmConn);
-    sliceOp->CopyOutput(TensorUsage::OFM, *ofmConn);
-    sliceOp->Output(TensorUsage::OFM)->Set(sliceShape);
-    auto *attr = sliceOp->Attribute<slice_attr_t>();
+    // Create a new memory copy op
     assert(sliceOffset + sliceShape <= ifmConn->shape);
     assert(sliceOffset >= ifmConn->shape.WithZeros());
-    attr->size = sliceShape;
-    attr->begin = sliceOffset;
-    RecordOptimisation(operation, sliceOp.get());
-    returnOp = sliceOp.get();
+    auto copyOp = std::make_shared<Operation>(OpType::MemoryCopy);
+    copyOp->CopyInput(TensorUsage::IFM, *ifmConn);
+    copyOp->Input(TensorUsage::IFM)->Set({sliceOffset, sliceShape, sliceStride});
+    copyOp->CopyOutput(TensorUsage::OFM, *ofmConn);
+    copyOp->Output(TensorUsage::OFM)->Set(Shape::DivRoundUp(sliceShape, sliceStride));
+    RecordOptimisation(operation, copyOp.get());
+    returnOp = copyOp.get();
 
     // Remove original op
    operation->Disconnect();
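
Reviewer note (not part of the patch): with the new TensorSlice::stride field, an OFM element at coordinate o reads the IFM element at offset + o * stride, and the OFM shape is the per-axis ceiling division of the sliced shape by the stride (Shape::DivRoundUp in the patch). A minimal standalone sketch of that read pattern, using plain std::vector coordinates instead of regor's Shape class (the helper names below are illustrative, not part of the codebase):

    // strided_slice_sketch.cpp -- offset/shape/stride slice semantics, NHWC axes.
    // Build: g++ -std=c++17 strided_slice_sketch.cpp
    #include <cassert>
    #include <cstdio>
    #include <vector>

    using Coord = std::vector<int>;

    // Per-axis ceiling division: the OFM shape produced by a strided slice.
    static Coord DivRoundUp(const Coord &shape, const Coord &stride)
    {
        Coord out(shape.size());
        for ( size_t i = 0; i < shape.size(); i++ )
        {
            assert(stride[i] > 0);  // negative strides stay passthrough in the patch
            out[i] = (shape[i] + stride[i] - 1) / stride[i];
        }
        return out;
    }

    // IFM coordinate read for a given OFM coordinate.
    static Coord IfmCoord(const Coord &offset, const Coord &stride, const Coord &ofm)
    {
        Coord in(ofm.size());
        for ( size_t i = 0; i < ofm.size(); i++ )
        {
            in[i] = offset[i] + ofm[i] * stride[i];
        }
        return in;
    }

    int main()
    {
        // Slice of shape [1,8,8,4] at offset [0,2,2,0] with stride 2 in H and W.
        Coord offset{0, 2, 2, 0}, shape{1, 8, 8, 4}, stride{1, 2, 2, 1};
        Coord ofmShape = DivRoundUp(shape, stride);         // -> [1,4,4,4]
        Coord in = IfmCoord(offset, stride, {0, 3, 1, 0});  // OFM (0,3,1,0) -> IFM (0,8,4,0)
        printf("ofm: %d %d %d %d\n", ofmShape[0], ofmShape[1], ofmShape[2], ofmShape[3]);
        printf("ifm: %d %d %d %d\n", in[0], in[1], in[2], in[3]);
    }

This is also why the scheduler change hands the H/W step to the feature-map reader via stepXY = stride.WH(), and why stepXY.x != 1 forces linear NHWC: per the patch comment, strided access in the width dimension is not supported with brick (NHCWB16) format.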
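A subtlety in the new three-argument TensorSlice::Initialize: it delegates to the two-argument overload, so each of offset, shape and stride is filled in only while still invalid (an empty Shape is falsy). A standalone mimic with a stand-in shape type, purely to make the fill-only-if-invalid rule concrete (MiniShape/MiniSlice are hypothetical names, not regor types):

    #include <cstdio>
    #include <vector>

    struct MiniShape  // stand-in for regor's Shape: empty == invalid/falsy
    {
        std::vector<int> v;
        explicit operator bool() const { return !v.empty(); }
    };

    struct MiniSlice  // mirrors the patched TensorSlice::Initialize overloads
    {
        MiniShape offset, shape, stride;
        void Initialize(const MiniShape &o, const MiniShape &s)
        {
            if ( !shape ) shape = s;
            if ( !offset ) offset = o;
        }
        void Initialize(const MiniShape &o, const MiniShape &s, const MiniShape &st)
        {
            Initialize(o, s);
            if ( !stride ) stride = st;  // stride filled only when still invalid
        }
    };

    int main()
    {
        MiniSlice slice;
        slice.offset = {{0, 2, 2, 0}};  // already valid: must survive Initialize
        slice.Initialize({{0, 0, 0, 0}}, {{1, 8, 8, 4}}, {{1, 2, 2, 1}});
        printf("offset[1]=%d shape[1]=%d stride[1]=%d\n",  // -> 2, 8, 2
            slice.offset.v[1], slice.shape.v[1], slice.stride.v[1]);
    }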
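Finally, the reworked guard in RewriteStridedSlice lowers a STRIDED_SLICE only when every stride is positive and only the H and W axes actually stride; anything else still becomes a passthrough op. A plain restatement of that predicate, mirroring what LessMask/WithHeight/WithWidth appear to compute (hypothetical helper, assuming NHWC-style trailing H/W/C axes):

    #include <cstdio>
    #include <vector>

    // True when the rewrite must bail out to passthrough, per the patched check:
    // any negative stride, or a stride != 1 on an axis other than H or W.
    static bool MustPassthrough(const std::vector<int> &stride)
    {
        const int n = static_cast<int>(stride.size());
        for ( int i = 0; i < n; i++ )
        {
            if ( stride[i] < 0 ) return true;  // sliceStride.LessMask(sliceStride.WithZeros())
            const bool isHorW = (i == n - 3) || (i == n - 2);  // H or W in ...HWC order
            if ( !isHorW && stride[i] != 1 ) return true;  // WithHeight(1).WithWidth(1) != all-ones
        }
        return false;
    }

    int main()
    {
        printf("%d\n", MustPassthrough({1, 2, 2, 1}));   // 0: strides only in H/W -> lowered
        printf("%d\n", MustPassthrough({2, 1, 1, 1}));   // 1: strided batch -> passthrough
        printf("%d\n", MustPassthrough({1, -2, 1, 1}));  // 1: negative stride -> passthrough
    }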