diff --git a/ethosu/regor/compiler/scheduler_decompose.cpp b/ethosu/regor/compiler/scheduler_decompose.cpp index 81234b4a98d4fd26e2f4c5e883d54762fc665234..9dafb6ce9f268b3b30add7192667f21bdfaaeaa5 100644 --- a/ethosu/regor/compiler/scheduler_decompose.cpp +++ b/ethosu/regor/compiler/scheduler_decompose.cpp @@ -1082,6 +1082,7 @@ std::vector> DecomposeDepthwiseConv2D(Archit auto subWeightOffset = weightsShape.WithZeros().WithDepth(multiplier); auto subWeightsConn = subOp->Input(TensorUsage::Weights); subWeightsConn->tensor = Slice(weightsConn->tensor.get(), subWeightOffset, subWeightsShape, subWeightsReadShape); + // Tensor is now in AxisOrder::HWCM with M=1 subWeightsConn->tensor->srcTensor->SetAxisOrder(AxisOrder::HWCM); subWeightsConn->tensor->consumers.push_back(subOp.get()); subWeightsConn->shape = subWeightsShape; @@ -1124,19 +1125,52 @@ std::vector> DecomposeDepthwiseConv2D(Archit } return result; } + if ( CanRunOnHardware(arch, op.get()) ) { UpdatePaddingAndIfmOffset(op.get()); result.emplace_back(std::move(op)); return result; } + + // If weight tensor is in AxisOrder::HWCM (with M=1, since depthMultiplier=1 at this point), + // reshape and set AxisOrder::IHWO with I=1, since the rest of the decomposition code + // only handles weight tensors with AxisOrder::OHWI or AxisOrder::IHWO + if ( weightsConn->tensor->srcTensor->AxisOrder() == AxisOrder::HWCM ) + { + auto weightShapeIHWO = weightsConn->SliceShape().Permute(0x0321); + weightsConn->tensor->srcTensor->SetAxisOrder(AxisOrder::IHWO); + weightsConn->tensor->bufferView = weightsConn->tensor->bufferView.Reshape(weightShapeIHWO); + weightsConn->tensor->storageShape = weightShapeIHWO; + weightsConn->shape = weightShapeIHWO; + } + auto &dilation = kernel->Dilation(); if ( dilation.x > 1 || dilation.y > 1 ) { return HandleDilation(arch, std::move(op), DecomposeDepthwiseConv2D); } - // TODO: MLBEDSW-8783 Decompose convolutions with large stride + try + { + if ( auto newBlockShape = NewOfmBlockShape(arch, op.get()) ) + { + return DecomposeBlocks(arch, std::move(op), newBlockShape, DecomposeDepthwiseConv2D); + } + } + catch ( const DecompositionFailure & ) + { + UpdatePaddingAndIfmOffset(op.get()); + result.emplace_back(std::move(op)); + return result; + } + + if ( arch->Constraints()->SupportsAccumulatorSaveRestore() && + op->Input(TensorUsage::Weights)->tensor->IsConstant() && op->Kernel()->Stride().AreaXY() > 1 ) + { + return DecomposeForStrides(arch, std::move(op), DecomposeDepthwiseConv2D); + } // If we get here, decomposition has failed, the resulting operations will be executed on CPU + UpdatePaddingAndIfmOffset(op.get()); result.emplace_back(std::move(op)); return result; }