diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85_constraints.cpp b/ethosu/regor/architecture/ethosu85/ethos_u85_constraints.cpp index 11768bf9f06413c5d195cc25b7a380d477355a3d..dfe21622f501bbcd11ce06107017411bf994111b 100644 --- a/ethosu/regor/architecture/ethosu85/ethos_u85_constraints.cpp +++ b/ethosu/regor/architecture/ethosu85/ethos_u85_constraints.cpp @@ -542,13 +542,14 @@ Flags EthosU85Constraints::OperatorQuery(OpType opType, const ArchO // Detailed operator queries if ( opType == OpType::MatMul ) { - // Constrain Matmul height to 1 - if ( ofmShape.Size() > 2 && ofmShape.Height() > 1 ) + // Constrain MatMul Batch to 1 + // Note that MatMul's OFM are effectively rank 3 with dimensions [N, H, W] with a possible leading 1 + if ( ofmShape.Size() > 2 && ofmShape[ofmShape.Size() - 3] > 1 ) { if ( req ) { req->req.Set(ArchRequirement::Decompose); - req->decomposeProps.Set(ArchProperty::TensorAxis); + req->decomposeProps.Set(ArchProperty::TensorDims); } result.Set(QueryResult::HasRequirements); } diff --git a/ethosu/regor/compiler/scheduler_decompose.cpp b/ethosu/regor/compiler/scheduler_decompose.cpp index 0c43470f46a956cb1c65bf26adb2647890566c87..38d9e307c9a7acf48951a205bd551218c7c1aa00 100644 --- a/ethosu/regor/compiler/scheduler_decompose.cpp +++ b/ethosu/regor/compiler/scheduler_decompose.cpp @@ -570,34 +570,58 @@ Slice(SchedulerTensor *tensor, const Shape &offset, const Shape &shape, Shape re } } -static Shape NewOfmBlockShape(Architecture *arch, SchedulerOperation *op) +static Shape MaxOfmShape(Shape &ofmShape, Shape &ifmShape, Kernel &kernel) { - // Find a block shape for decomposition that will fit in accumulator RAM, - // and where ifm also fits. - // GetOpConfig finds a block that fulfills this. - // TODO: MLBEDSW-9860 - // If block decomposition is needed just because of too large ifm/ofm dimension, - // a larger block size could potentially be used. 
- // For a 1x1 kernel without ifm/ofm size above the limit, no block decomposition is needed, - // as accumulators do not need to be retained. + // Calculate max IFM height and width which can be consumed by a single block + const auto &padding = kernel.Padding(); + int maxIfmHeight = std::min(ifmShape.Height() + padding.Top() + padding.Bottom(), MAX_DIM); + int maxIfmWidth = std::min(ifmShape.Width() + padding.Left() + padding.Right(), MAX_DIM); + // Calculate max OFM dimensions for a single block + int maxOfmHeight = std::min(int((maxIfmHeight - kernel.DilatedWH().y) / kernel.Stride().y) + 1, MAX_DIM); + int maxOfmWidth = std::min(int((maxIfmWidth - kernel.DilatedWH().x) / kernel.Stride().x) + 1, MAX_DIM); + int maxOfmDepth = std::min(ofmShape.Depth(), MAX_DIM); + + return ofmShape.WithHeight(maxOfmHeight).WithWidth(maxOfmWidth).WithDepth(maxOfmDepth); +} + +static Shape NewOfmBlockShape(Architecture *arch, SchedulerOperation *op, const ArchRequirements &req) +{ Shape newBlock; + auto ifmShape = op->IFM(0)->SliceShape(); auto ofmShape = op->OFM()->SliceShape(); auto kernel = *op->Kernel(); - // Get block config for the op after decomposition to smaller kernel - // Avoids problems where a block config can't be found as ifm gets too big for RAM - auto minKernel = kernel.WithSize({1, 1}).WithStride({1, 1}); - op->SetKernel(minKernel); - auto config = GetOpConfig(arch, op); - op->SetKernel(kernel); - assert(config && "No config found."); - if ( !config ) throw DecompositionFailure("No config found"); - auto HW = config->OptimalStripeGranule(); - Shape configBlock = ofmShape.WithBatch(1).WithHW(HW.y, HW.x).WithDepth(config->OptimalDepthGranule()); - if ( Shape::Min(ofmShape, configBlock) != ofmShape ) - { - newBlock = Shape::Min(ofmShape, configBlock); + + // Max OFM shape that can be produced by one op + Shape maxShape = MaxOfmShape(ofmShape, ifmShape, kernel); + + if ( req.decomposeProps.Any(ArchProperty::KernelStride) || ifmShape.Depth() > MAX_DIM ) + { + // 
Decomposition of strides and/or IFM channels requires decomposing into blocks which + // will fit in accumulator RAM, and where ifm also fits. + // GetOpConfig finds a block that fulfills this. + // Get block config for the op after decomposition to smaller kernel + // Avoids problems where a block config can't be found as ifm gets too big for RAM + auto minKernel = kernel.WithSize({1, 1}).WithStride({1, 1}); + op->SetKernel(minKernel); + auto config = GetOpConfig(arch, op); + op->SetKernel(kernel); + assert(config && "No config found."); + if ( !config ) throw DecompositionFailure("No config found"); + auto HW = config->OptimalStripeGranule(); + Shape configBlock = ofmShape.WithBatch(1).WithHW(HW.y, HW.x).WithDepth(config->OptimalDepthGranule()); + if ( Shape::Min(ofmShape, configBlock) != ofmShape ) + { + newBlock = Shape::Min(ofmShape, configBlock); + } } + else if ( Shape::Min(ofmShape, maxShape) != ofmShape ) + { + // Decompose OFM into blocks that are as large as possible, limited only by constraints + // on Tensor axes of IFM and OFM. 
+ newBlock = Shape::Min(ofmShape, maxShape); + } + return newBlock; } @@ -836,7 +860,10 @@ std::vector> DecomposeConv2D(Architecture *a ofmSlice.Initialize(ofmShape.WithZeros(), ofmShape); ifmSlice.Initialize(ifmShape.WithZeros().WithHW(-padding.Top(), -padding.Left()), ifmShape); - if ( ofmShape.Batch() > 1 ) + ArchRequirements req{}; + Flags qResult = OperatorQuery(arch, op.get(), &req); + + if ( qResult.Any(QueryResult::HasRequirements) && req.decomposeProps.Any(ArchProperty::TensorDims) ) { return DecomposeLeadingDimensions(1, arch, std::move(op), DecomposeConv2D); } @@ -847,13 +874,13 @@ std::vector> DecomposeConv2D(Architecture *a return result; } auto &dilation = kernel->Dilation(); - if ( dilation.x > 1 || dilation.y > 1 ) + if ( req.decomposeProps.Any(ArchProperty::KernelDilation) ) { return HandleDilation(arch, std::move(op), DecomposeConv2D); } try { - if ( auto newBlockShape = NewOfmBlockShape(arch, op.get()) ) + if ( auto newBlockShape = NewOfmBlockShape(arch, op.get(), req) ) { return DecomposeBlocks(arch, std::move(op), newBlockShape, DecomposeConv2D); } @@ -865,8 +892,8 @@ std::vector> DecomposeConv2D(Architecture *a return result; } - if ( arch->Constraints()->SupportsAccumulatorSaveRestore() && - op->Input(TensorUsage::Weights)->tensor->IsConstant() && op->Kernel()->Stride().AreaXY() > 1 ) + if ( arch->Constraints()->SupportsAccumulatorSaveRestore() && req.decomposeProps.Any(ArchProperty::KernelStride) && + op->Input(TensorUsage::Weights)->tensor->IsConstant() ) { return DecomposeForStrides(arch, std::move(op), DecomposeConv2D); } @@ -891,7 +918,10 @@ std::vector> DecomposeConv3D(Architecture *a ofmSlice.Initialize(ofmShape.WithZeros(), ofmShape); ifmSlice.Initialize(ifmShape.WithZeros(), ifmShape); - if ( ofmShape[0] > 1 ) // Batch + ArchRequirements req{}; + Flags qResult = OperatorQuery(arch, op.get(), &req); + + if ( qResult.Any(QueryResult::HasRequirements) && req.decomposeProps.Any(ArchProperty::TensorDims) ) { return 
DecomposeLeadingDimensions(1, arch, std::move(op), DecomposeConv3D); } @@ -1063,7 +1093,10 @@ std::vector> DecomposeDepthwiseConv2D(Archit ofmSlice.Initialize(ofmShape.WithZeros(), ofmShape); ifmSlice.Initialize(ifmShape.WithZeros().WithHW(-padding.Top(), -padding.Left()), ifmShape); - if ( ofmShape.Batch() > 1 ) + ArchRequirements req{}; + Flags qResult = OperatorQuery(arch, op.get(), &req); + + if ( qResult.Any(QueryResult::HasRequirements) && req.decomposeProps.Any(ArchProperty::TensorDims) ) { return DecomposeLeadingDimensions(1, arch, std::move(op), DecomposeDepthwiseConv2D); } @@ -1169,13 +1202,13 @@ std::vector> DecomposeDepthwiseConv2D(Archit } auto &dilation = kernel->Dilation(); - if ( dilation.x > 1 || dilation.y > 1 ) + if ( req.decomposeProps.Any(ArchProperty::KernelDilation) ) { return HandleDilation(arch, std::move(op), DecomposeDepthwiseConv2D); } try { - if ( auto newBlockShape = NewOfmBlockShape(arch, op.get()) ) + if ( auto newBlockShape = NewOfmBlockShape(arch, op.get(), req) ) { return DecomposeBlocks(arch, std::move(op), newBlockShape, DecomposeDepthwiseConv2D); } @@ -1188,7 +1221,7 @@ std::vector> DecomposeDepthwiseConv2D(Archit } if ( arch->Constraints()->SupportsAccumulatorSaveRestore() && - op->Input(TensorUsage::Weights)->tensor->IsConstant() && op->Kernel()->Stride().AreaXY() > 1 ) + req.decomposeProps.Any(ArchProperty::KernelStride) && op->Input(TensorUsage::Weights)->tensor->IsConstant() ) { return DecomposeForStrides(arch, std::move(op), DecomposeDepthwiseConv2D); } @@ -1277,7 +1310,10 @@ std::vector> DecomposeTransposeConv2D(Archit ofmSlice.Initialize(ofmShape.WithZeros(), ofmShape); ifmSlice.Initialize(ifmShape.WithZeros(), ifmShape); - if ( ofmShape.Batch() > 1 ) + ArchRequirements req{}; + Flags qResult = OperatorQuery(arch, op.get(), &req); + + if ( qResult.Any(QueryResult::HasRequirements) && req.decomposeProps.Any(ArchProperty::TensorDims) ) { return DecomposeLeadingDimensions(1, arch, std::move(op), 
DecomposeTransposeConv2D); } @@ -1657,10 +1693,13 @@ std::vector> DecomposeElementwise(Architectu ifm2Slice.Initialize(ifm2Shape.WithZeros(), ifm2Shape); } - auto ofmRank = ofmShape.Size(); - if ( ofmRank > 3 && ofmShape.Elements() > ofmShape.ElementsHWC() ) + + ArchRequirements req{}; + Flags qResult = OperatorQuery(arch, op.get(), &req); + + if ( qResult.Any(QueryResult::HasRequirements) && req.decomposeProps.Any(ArchProperty::TensorDims) ) { - return DecomposeLeadingDimensions(ofmRank - 3, arch, std::move(op), DecomposeElementwise); + return DecomposeLeadingDimensions(ofmShape.Size() - 3, arch, std::move(op), DecomposeElementwise); } if ( auto maxShape = Shape::Min(Shape(nullptr, ofmShape.Size(), MAX_DIM), ofmShape); maxShape != ofmShape ) { @@ -1687,11 +1726,13 @@ std::vector> DecomposeMatmul(Architecture *a ifmSlice.Initialize(ifmShape.WithZeros(), ifmShape); ifm2Slice.Initialize(ifm2Shape.WithZeros(), ifm2Shape); + ArchRequirements req{}; + Flags qResult = OperatorQuery(arch, op.get(), &req); + // Decompose Batching - auto ofmRank = ofmShape.Size(); - if ( ofmRank > 2 && ofmShape.Elements() > ofmShape.ElementsWC() ) + if ( qResult.Any(QueryResult::HasRequirements) && req.decomposeProps.Any(ArchProperty::TensorDims) ) { - return DecomposeLeadingDimensions(ofmRank - 2, arch, std::move(op), DecomposeMatmul); + return DecomposeLeadingDimensions(ofmShape.Size() - 2, arch, std::move(op), DecomposeMatmul); } // Define total dimensions of input and output matrices @@ -2456,10 +2497,13 @@ std::vector> DecomposeAvgPool(Architecture * auto &padding = kernel->Padding(); ofmSlice.Initialize(ofmShape.WithZeros(), ofmShape); ifmSlice.Initialize(ifmShape.WithZeros().WithHW(-padding.Top(), -padding.Left()), ifmShape); - auto ofmRank = ofmShape.Size(); - if ( ofmRank > 3 && (ofmShape.Elements() > ofmShape.ElementsHWC()) ) + + ArchRequirements req{}; + Flags qResult = OperatorQuery(arch, op.get(), &req); + + if ( qResult.Any(QueryResult::HasRequirements) && 
req.decomposeProps.Any(ArchProperty::TensorDims) ) { - return DecomposeLeadingDimensions(ofmRank - 3, arch, std::move(op), DecomposeAvgPool); + return DecomposeLeadingDimensions(1, arch, std::move(op), DecomposeAvgPool); } if ( !NeedsDecompose(arch, op.get()) ) @@ -2469,9 +2513,6 @@ std::vector> DecomposeAvgPool(Architecture * return result; } - ArchRequirements req{}; - Flags qResult = OperatorQuery(arch, op.get(), &req); - // Perform scaling of the output if needed const int scaleSize = ofmConn->quantization.scales.size(); if ( qResult.Any(QueryResult::HasRequirements) && req.decomposeProps.Any(ArchProperty::Scaling, ArchProperty::KernelStride) && scaleSize ) @@ -2576,7 +2617,7 @@ std::vector> DecomposeAvgPool(Architecture * // Decomposition for large dimensions try { - if ( auto newBlockShape = NewOfmBlockShape(arch, op.get()) ) + if ( auto newBlockShape = NewOfmBlockShape(arch, op.get(), req) ) { return DecomposeBlocks(arch, std::move(op), newBlockShape, DecomposeAvgPool); } @@ -2612,10 +2653,13 @@ std::vector> DecomposeMaxPool(Architecture * auto &padding = kernel->Padding(); ofmSlice.Initialize(ofmShape.WithZeros(), ofmShape); ifmSlice.Initialize(ifmShape.WithZeros().WithHW(-padding.Top(), -padding.Left()), ifmShape); - auto ofmRank = ofmShape.Size(); - if ( ofmRank > 3 && (ofmShape.Elements() > ofmShape.ElementsHWC()) ) + + ArchRequirements req{}; + Flags qResult = OperatorQuery(arch, op.get(), &req); + + if ( qResult.Any(QueryResult::HasRequirements) && req.decomposeProps.Any(ArchProperty::TensorDims) ) { - return DecomposeLeadingDimensions(ofmRank - 3, arch, std::move(op), DecomposeMaxPool); + return DecomposeLeadingDimensions(ofmShape.Size() - 3, arch, std::move(op), DecomposeMaxPool); } if ( !NeedsDecompose(arch, op.get()) ) { @@ -2625,7 +2669,7 @@ std::vector> DecomposeMaxPool(Architecture * } try { - if ( auto newBlockShape = NewOfmBlockShape(arch, op.get()) ) + if ( auto newBlockShape = NewOfmBlockShape(arch, op.get(), req) ) { return 
DecomposeBlocks(arch, std::move(op), newBlockShape, DecomposeMaxPool); } @@ -2636,7 +2680,7 @@ std::vector> DecomposeMaxPool(Architecture * result.emplace_back(std::move(op)); return result; } - if ( arch->Constraints()->SupportsAccumulatorSaveRestore() && op->Kernel()->Stride().AreaXY() > 1 ) + if ( arch->Constraints()->SupportsAccumulatorSaveRestore() && req.decomposeProps.Any(ArchProperty::KernelStride) ) { return DecomposeForStrides(arch, std::move(op), DecomposeMaxPool); } @@ -2661,12 +2705,8 @@ std::vector> DecomposeResize(Architecture *a ArchRequirements req{}; auto qResult = OperatorQuery(arch, op.get(), &req); - bool decomposeLeadingDims = false; - if ( qResult.Any(QueryResult::HasRequirements) && req.req.Any(ArchRequirement::Decompose) ) - { - decomposeLeadingDims = req.decomposeProps.Any(ArchProperty::TensorDims); - } - if ( decomposeLeadingDims ) + + if ( qResult.Any(QueryResult::HasRequirements) && req.decomposeProps.Any(ArchProperty::TensorDims) ) { return DecomposeLeadingDimensions(ofmShape.Size() - 3, arch, std::move(op), DecomposeResize); } diff --git a/ethosu/regor/test/test_scheduler_decompose.cpp b/ethosu/regor/test/test_scheduler_decompose.cpp index 04568fe197455f82a71807d085d1aae337c4bd14..d5c7b2dd766b3aa2f26234cb142590499a04dba4 100644 --- a/ethosu/regor/test/test_scheduler_decompose.cpp +++ b/ethosu/regor/test/test_scheduler_decompose.cpp @@ -65,7 +65,7 @@ TEST_CASE("test_scheduler_decompose") Shape ifmShape(1, 100, 3, 2); // ifm2 is transposed by graphIR optimiser to same shape as ifm1 Shape ofmShape(1, 100, 3, 3); auto op = CreateOperation(OpType::MatMul, ifmShape, ifmShape, ofmShape); - std::vector> decomposedOps = DecomposeMatmul(nullptr, std::move(op)); + std::vector> decomposedOps = DecomposeMatmul(arch.get(), std::move(op)); REQUIRE(decomposedOps.size() == 100); for ( size_t i = 0; i < decomposedOps.size(); i++ ) { @@ -93,7 +93,7 @@ TEST_CASE("test_scheduler_decompose") op->Input(TensorUsage::IFM0)->slice = {ifmSliceOffset, 
ifmSliceShape}; op->Input(TensorUsage::IFM1)->slice = {ifmSliceOffset, ifmSliceShape}; op->Output(TensorUsage::OFM)->slice = {ofmSliceOffset, ofmSliceShape}; - std::vector> decomposedOps = DecomposeMatmul(nullptr, std::move(op)); + std::vector> decomposedOps = DecomposeMatmul(arch.get(), std::move(op)); REQUIRE(decomposedOps.size() == 98); for ( size_t i = 0; i < decomposedOps.size(); i++ ) { @@ -114,7 +114,7 @@ TEST_CASE("test_scheduler_decompose") Shape ifmShape(100, 1, 3, 2); Shape ofmShape(100, 1, 3, 3); auto op = CreateOperation(OpType::MatMul, ifmShape, ifmShape, ofmShape); - std::vector> decomposedOps = DecomposeMatmul(nullptr, std::move(op)); + std::vector> decomposedOps = DecomposeMatmul(arch.get(), std::move(op)); REQUIRE(decomposedOps.size() == 100); for ( size_t i = 0; i < decomposedOps.size(); i++ ) { @@ -142,7 +142,7 @@ TEST_CASE("test_scheduler_decompose") op->Input(TensorUsage::IFM0)->slice = {ifmSliceOffset, ifmSliceShape}; op->Input(TensorUsage::IFM1)->slice = {ifmSliceOffset, ifmSliceShape}; op->Output(TensorUsage::OFM)->slice = {ofmSliceOffset, ofmSliceShape}; - std::vector> decomposedOps = DecomposeMatmul(nullptr, std::move(op)); + std::vector> decomposedOps = DecomposeMatmul(arch.get(), std::move(op)); REQUIRE(decomposedOps.size() == 98); for ( size_t i = 0; i < decomposedOps.size(); i++ ) { @@ -163,7 +163,7 @@ TEST_CASE("test_scheduler_decompose") Shape ifmShape(10, 10, 3, 2); Shape ofmShape(10, 10, 3, 3); auto op = CreateOperation(OpType::MatMul, ifmShape, ifmShape, ofmShape); - std::vector> decomposedOps = DecomposeMatmul(nullptr, std::move(op)); + std::vector> decomposedOps = DecomposeMatmul(arch.get(), std::move(op)); REQUIRE(decomposedOps.size() == 100); for ( size_t i = 0; i < decomposedOps.size(); i++ ) { @@ -197,7 +197,7 @@ TEST_CASE("test_scheduler_decompose") op->Input(TensorUsage::IFM0)->slice = {ifmSliceOffset, ifmSliceShape}; op->Input(TensorUsage::IFM1)->slice = {ifmSliceOffset, ifmSliceShape}; 
op->Output(TensorUsage::OFM)->slice = {ofmSliceOffset, ofmSliceShape}; - std::vector> decomposedOps = DecomposeMatmul(nullptr, std::move(op)); + std::vector> decomposedOps = DecomposeMatmul(arch.get(), std::move(op)); REQUIRE(decomposedOps.size() == 64); for ( size_t i = 0; i < decomposedOps.size(); i++ ) { @@ -226,7 +226,7 @@ TEST_CASE("test_scheduler_decompose") Shape ofmShape(1, 1, 3, 3); auto op = CreateOperation(OpType::MatMul, ifmShape, ifmShape, ofmShape); SchedulerOperation *orig = op.get(); - std::vector> decomposedOps = DecomposeMatmul(nullptr, std::move(op)); + std::vector> decomposedOps = DecomposeMatmul(arch.get(), std::move(op)); REQUIRE(decomposedOps.size() == 1); REQUIRE(orig == decomposedOps[0].get()); }