From 981cc4425005b1be73a8a862a8057f6ec91cb37d Mon Sep 17 00:00:00 2001 From: Johan Gunnarsson Date: Fri, 9 May 2025 08:51:03 +0200 Subject: [PATCH] MLBEDSW-9408: Add full REDUCE MIN/MAX/SUM decomposition This implements full decomposition of TOSA REDUCE MIN/MAX/ANY/ALL along the reduced axis. * Extend decomposition to do blockwise reduce operations along the reduced axis. * Add ReduceSum/ReduceMinMax to constraints. * Move reshaping of reduce ops into decomposition. * Move creating a reduce ops kernel to ConvertAttributes. * Remove RewriteReduceMinMaxAnyAll. Signed-off-by: Johan Gunnarsson Change-Id: Ic0873dab1c3c5344045590d1c11724986b896120 --- .../architecture/architecture_constraints.hpp | 2 + .../ethosu55/ethos_u55_constraints.cpp | 16 +- .../ethosu85/ethos_u85_constraints.cpp | 28 +++- ethosu/regor/compiler/graphir_optimiser.cpp | 47 ++---- ethosu/regor/compiler/graphir_optimiser.hpp | 2 - ethosu/regor/compiler/scheduler_decompose.cpp | 151 ++++++++++++++++-- ethosu/regor/compiler/scheduler_packing.cpp | 1 + .../regor/test/test_scheduler_decompose.cpp | 61 ++++++- 8 files changed, 256 insertions(+), 52 deletions(-) diff --git a/ethosu/regor/architecture/architecture_constraints.hpp b/ethosu/regor/architecture/architecture_constraints.hpp index a695aee1..0a84117c 100644 --- a/ethosu/regor/architecture/architecture_constraints.hpp +++ b/ethosu/regor/architecture/architecture_constraints.hpp @@ -53,6 +53,7 @@ struct ArchOperatorQuery ReverseType reverseMask = ReverseType::None; TransposeType transposeMask = TransposeType::None; const Kernel *kernel = nullptr; + int axis = 0; // Uses negative notation: -1 = C, -2 = W, ... 
~ArchOperatorQuery(){}; }; @@ -75,6 +76,7 @@ enum class ArchProperty KernelDilation = 1 << 3, DepthMultiplier = 1 << 4, TransposeMask = 1 << 5, + ReduceAxis = 1 << 6, }; struct ArchRequirements diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_constraints.cpp b/ethosu/regor/architecture/ethosu55/ethos_u55_constraints.cpp index 090530d9..89b061f4 100644 --- a/ethosu/regor/architecture/ethosu55/ethos_u55_constraints.cpp +++ b/ethosu/regor/architecture/ethosu55/ethos_u55_constraints.cpp @@ -319,7 +319,7 @@ Flags EthosU55Constraints::OperatorQuery(OpType opType, const ArchO // Check hardware-required substitutions first if ( (opType == OpType::Sigmoid) || (opType == OpType::Tanh) ) { - if ( query->ifm[0].type != DataType::Int16 ) + if ( query && query->ifm[0].type != DataType::Int16 ) { if ( req ) { @@ -368,6 +368,20 @@ Flags EthosU55Constraints::OperatorQuery(OpType opType, const ArchO return QueryResult::NativeConstrained; } + if ( npuOp == EthosU55NpuOp::ReduceSum ) + { + // unsupported reduce axis (only C supported) + if ( query->axis != -1 /* C */ ) + { + if ( req ) + { + req->req.Set(ArchRequirement::Decompose); + req->decomposeProps.Set(ArchProperty::ReduceAxis); + } + result.Set(QueryResult::HasRequirements); + } + } + const auto &ifmShape = query->ifm[0].shape; const auto &ifm2Shape = query->ifm[1].shape; const auto &ofmShape = query->ofm.shape; diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85_constraints.cpp b/ethosu/regor/architecture/ethosu85/ethos_u85_constraints.cpp index b9b72156..4bdb4d43 100644 --- a/ethosu/regor/architecture/ethosu85/ethos_u85_constraints.cpp +++ b/ethosu/regor/architecture/ethosu85/ethos_u85_constraints.cpp @@ -350,8 +350,8 @@ Flags EthosU85Constraints::OperatorQuery(OpType opType, const ArchO { req->req.Set(ArchRequirement::Decompose); req->decomposeProps.Set(ArchProperty::TransposeMask); - result.Set(QueryResult::HasRequirements); } + result.Set(QueryResult::HasRequirements); } else { @@ -372,6 +372,32 @@ Flags 
EthosU85Constraints::OperatorQuery(OpType opType, const ArchO { return result; } + else if ( npuOp == EthosU85NpuOp::ReduceMinMax ) + { + // unsupported reduce axis (only H and W supported) + if ( query->axis != -3 /* H */ && query->axis != -2 /* W */ ) + { + if ( req ) + { + req->req.Set(ArchRequirement::Decompose); + req->decomposeProps.Set(ArchProperty::ReduceAxis); + } + result.Set(QueryResult::HasRequirements); + } + } + else if ( npuOp == EthosU85NpuOp::ReduceSum ) + { + // unsupported reduce axis (only C supported) + if ( query->axis != -1 /* C */ ) + { + if ( req ) + { + req->req.Set(ArchRequirement::Decompose); + req->decomposeProps.Set(ArchProperty::ReduceAxis); + } + result.Set(QueryResult::HasRequirements); + } + } const auto &ifmShape = query->ifm[0].shape; const auto &ifm2Shape = query->ifm[1].shape; diff --git a/ethosu/regor/compiler/graphir_optimiser.cpp b/ethosu/regor/compiler/graphir_optimiser.cpp index af15538f..c0a919e3 100644 --- a/ethosu/regor/compiler/graphir_optimiser.cpp +++ b/ethosu/regor/compiler/graphir_optimiser.cpp @@ -199,6 +199,22 @@ Operation *GraphIrOptimiser::ConvertAttributes(Graph *const graph, Operation *co assert((mask == ReverseType::None || IsPowerOfTwo(unsigned(mask))) && "Reverse operation can only have one axis"); ofmConn->reverse = mask; } + else if ( opType == OpType::ReduceMin || opType == OpType::ReduceMax || opType == OpType::ReduceAny || opType == OpType::ReduceAll ) + { + TensorConnection *ifmConn = operation->Input(TensorUsage::IFM); + auto *attr = operation->Attribute(); + auto axis = attr->axis; + if ( axis < 0 ) axis = ifmConn->shape.Size() + axis; + assert(axis >= 0); + assert(axis < ifmConn->shape.Size()); + // Create a reduce kernel, if reducing in H or W + Kernel kernel = *operation->Kernel(); + if ( axis == ifmConn->shape.Size() - 3 ) + kernel = operation->Kernel()->WithSize({1 /* W */, ifmConn->shape.Height() /* H */}); + else if ( axis == ifmConn->shape.Size() - 2 ) + kernel = 
operation->Kernel()->WithSize({ifmConn->shape.Width() /* W */, 1 /* H */}); + operation->SetKernel(std::make_unique(std::move(kernel))); + } return operation; } @@ -1345,36 +1361,6 @@ Operation *GraphIrOptimiser::RewriteSelect(Graph *const graph, Operation *const return returnOp; } -// Rewrite REDUCE_{MIN,MAX,ANY,ALL} IFM/OFM shapes and set a kernel matching the axis to reduce -Operation *GraphIrOptimiser::RewriteReduceMinMaxAnyAll(Graph *const graph, Operation *const operation) -{ - UNUSED(graph); - Operation *returnOp = operation; - const OpType opType = operation->Type(); - if ( opType == OpType::ReduceMin || opType == OpType::ReduceMax || opType == OpType::ReduceAny || opType == OpType::ReduceAll ) - { - auto *ifmConn = operation->Input(TensorUsage::IFM); - auto *ofmConn = operation->Output(TensorUsage::OFM); - auto *attr = operation->Attribute(); - auto axis = attr->axis; - if ( axis < 0 ) axis = ifmConn->shape.Size() + axis; - assert(axis >= 0); - assert(axis < ifmConn->shape.Size()); - - // Reshape IFM/OFM so IFM is HxWxC and OFM is Hx1xC - ifmConn->shape = ReshapeTo3DAroundAxis(ifmConn->shape, axis); - ofmConn->shape = ifmConn->shape.WithWidth(1); - - // Update the axis to reduce to match the reshapes shapes - attr->axis = 1; - - // Set kernel to 1xW (where W is the width of the reshaped shapes) - auto kernel = operation->Kernel()->WithSize({ifmConn->shape.Width() /* W */, 1 /* H */}); - operation->SetKernel(std::make_unique(std::move(kernel))); - } - return returnOp; -} - // Rewrite REDUCE_SUM with any axis into a REDUCE_SUM with C axis Operation *GraphIrOptimiser::RewriteReduceSum(Graph *const graph, Operation *const operation) { @@ -1518,6 +1504,7 @@ Operation *GraphIrOptimiser::RewriteReduceSum(Graph *const graph, Operation *con operation->Input(TensorUsage::IFM)->Set(ifmShape3D); operation->Output(TensorUsage::OFM)->Set(ifmShape3D.WithDepth(1)); + attr->axis = 2; // C } } } diff --git a/ethosu/regor/compiler/graphir_optimiser.hpp 
b/ethosu/regor/compiler/graphir_optimiser.hpp index bc79c703..59037ce7 100644 --- a/ethosu/regor/compiler/graphir_optimiser.hpp +++ b/ethosu/regor/compiler/graphir_optimiser.hpp @@ -59,7 +59,6 @@ private: Operation *RewriteSlice(Graph *const graph, Operation *const operation); Operation *RewriteNegate(Graph *const graph, Operation *const operation); Operation *RewriteSelect(Graph *const graph, Operation *const operation); - Operation *RewriteReduceMinMaxAnyAll(Graph *const graph, Operation *const operation); Operation *RewriteReduceSum(Graph *const graph, Operation *const operation); Operation *RewriteTile(Graph *const graph, Operation *const operation); Operation *RewriteMatmul(Graph *const graph, Operation *const operation); @@ -147,7 +146,6 @@ private: &GraphIrOptimiser::RewriteConcat, &GraphIrOptimiser::RewriteSlice, &GraphIrOptimiser::RewriteNegate, - &GraphIrOptimiser::RewriteReduceMinMaxAnyAll, &GraphIrOptimiser::RewriteReduceSum, &GraphIrOptimiser::RewriteTile, &GraphIrOptimiser::RewriteMatmul, diff --git a/ethosu/regor/compiler/scheduler_decompose.cpp b/ethosu/regor/compiler/scheduler_decompose.cpp index 09e7b7be..357245e9 100644 --- a/ethosu/regor/compiler/scheduler_decompose.cpp +++ b/ethosu/regor/compiler/scheduler_decompose.cpp @@ -40,6 +40,15 @@ Flags OperatorQuery(Architecture *arch, const SchedulerOperation *s query.transposeMask = ofmConn->transpose; query.reverseMask = ofmConn->reverse; query.kernel = schedOp->Kernel(); + if ( schedOp->HasAttribute() ) + { + query.axis = schedOp->Attribute()->axis; + if ( query.axis >= 0 ) + { + // Convert axis to negative notation + query.axis -= query.ifm[0].shape.Size(); + } + } return arch->Constraints()->OperatorQuery(schedOp->Type(), &query, req); } @@ -1427,27 +1436,141 @@ std::vector> DecomposeReduce(Architecture *a ofmSlice.Initialize(ofmShape.WithZeros(), ofmShape); ifmSlice.Initialize(ifmShape.WithZeros(), ifmShape); - if ( auto ifm2Conn = op->TryInput(TensorUsage::IFM1) ) + const auto ifmRank = 
ifmShape.Size(); + auto attr = op->Attribute(); + const int reducedAxis = attr->axis; + assert(reducedAxis >= 0); + assert(reducedAxis < ifmRank); + const bool isReduceInH = reducedAxis == ifmRank - 3; + const bool isReduceInW = reducedAxis == ifmRank - 2; + const bool isReduceInC = reducedAxis == ifmRank - 1; + + // Decompose Reduce Min/Max/Sum with the following algorithm so that it can run on NPU. + // + // 1. Reshape the IFM/OFM so that the dimension to reduce is either H, W or C (depending on which type of operation + // it is) and IFM/OFM are 3D shapes. When reshaping >4D shapes, we may lose the slice information, so therefore, + // at this point slicing is not supported. + // 2. Create operations so that the reduced axis is reduced in blocks of 64k, with a final operation to produce the + // results in the original OFM. + + // Figure out what we need to decompose + ArchRequirements req{}; + auto qResult = OperatorQuery(arch, op.get(), &req); + bool decomposeReshape = false; + if ( qResult.Any(QueryResult::HasRequirements) && req.req.Any(ArchRequirement::Decompose) ) { - auto ifm2Shape = ifm2Conn->shape; - auto &ifm2Slice = ifm2Conn->slice; + decomposeReshape = req.decomposeProps.Any(ArchProperty::ReduceAxis, ArchProperty::TensorDims); + } - ifm2Slice.Initialize(ifm2Shape.WithZeros(), ifm2Shape); + // Reshape to a 3D tensor + if ( decomposeReshape ) + { + // Slice offset not supported if we need to reshape + assert(ofmSlice.offset.GreaterMask(ofmSlice.offset.WithZeros()) == 0); + assert(ifmSlice.offset.GreaterMask(ifmSlice.offset.WithZeros()) == 0); + + if ( op->Type() == OpType::ReduceSum ) + { + // ReduceSum can only reduce in C + assert(isReduceInC); + + // Reshape to 3D with all >=H dimensions in H + ifmConn->shape = ReshapeTo3D(ifmConn->shape, {ifmConn->shape.Size() - 2, 1, 1}); + ifmSlice = {}; + ofmConn->shape = ReshapeTo3D(ofmConn->shape, {ofmConn->shape.Size() - 2, 1, 1}); + ofmSlice = {}; + attr->axis = 2; // C + } + else + { + // Reshape to 3D 
around W + ifmConn->shape = ReshapeTo3DAroundAxis(ifmConn->shape, reducedAxis); + ifmSlice = {}; + ofmConn->shape = ReshapeTo3DAroundAxis(ofmConn->shape, reducedAxis); + ofmSlice = {}; + op->SetKernel(op->Kernel()->WithSize({ifmConn->shape.Width() /* W */, 1 /* H */})); + attr->axis = 1; // W + } + + return DecomposeReduce(arch, std::move(op)); } - auto ofmRank = ofmShape.Size(); - auto attr = op->Attribute(); - int reducedAxis = attr->axis; + // Handle reduced axis + if ( ifmShape[reducedAxis] > MAX_DIM ) + { + // Create an intermediate tensor + const int blockCount = (ifmShape[reducedAxis] - 1) / MAX_DIM + 1; + auto newTensor = ifmConn->tensor->Clone(); + newTensor->srcTensor = nullptr; + newTensor->storageShape = ifmShape.With(reducedAxis, blockCount); - for ( int axis = 0; axis < ofmRank; axis++ ) + LOG_TRACE1("DecomposeReduce: Reduce dimension too large, axis {}, size {}, intermediate shape ({})\n", + reducedAxis, ifmShape[reducedAxis], newTensor->storageShape.ToString()); + + for ( int blockIndex = 0; blockIndex < blockCount; blockIndex++ ) + { + // Create one new reduce op for each block + const int blockSize = std::min(MAX_DIM, ifmShape[reducedAxis] - blockIndex * MAX_DIM); + std::unique_ptr subOp; + Kernel kernel; + if ( isReduceInH ) kernel = op->Kernel()->WithSize({1 /* W */, blockSize /* H */}); + else if ( isReduceInW ) kernel = op->Kernel()->WithSize({blockSize /* W */, 1 /* H */}); + subOp = MakeSubOperation(op.get(), isReduceInC ? 
nullptr : &kernel); + + auto *subOpIfmConn = subOp->IFM(0); + subOpIfmConn->slice.offset = ifmSlice.offset.With(reducedAxis, blockIndex * MAX_DIM); + subOpIfmConn->slice.shape = ifmSlice.shape.With(reducedAxis, blockSize); + subOpIfmConn->quantization = ifmConn->quantization; + auto *subOpOfmConn = subOp->OFM(); + subOpOfmConn->tensor = newTensor; + subOpOfmConn->shape = newTensor->storageShape; + subOpOfmConn->slice.offset = ofmSlice.offset.With(reducedAxis, blockIndex); + subOpOfmConn->slice.shape = ofmSlice.shape.With(reducedAxis, 1); + subOpOfmConn->quantization = ofmConn->quantization; + newTensor->producers.push_back(subOp.get()); + + LOG_TRACE1("DecomposeReduce: Block, IFM ({}) @ ({}) from ({}), OFM ({}) @ ({}) from ({})\n", + subOpIfmConn->slice.shape.ToString(), subOpIfmConn->slice.offset.ToString(), subOpIfmConn->shape.ToString(), + subOpOfmConn->slice.shape.ToString(), subOpOfmConn->slice.offset.ToString(), subOpOfmConn->shape.ToString()); + + auto subOps = DecomposeReduce(arch, std::move(subOp)); + result.insert(result.end(), std::make_move_iterator(subOps.begin()), std::make_move_iterator(subOps.end())); + } + + // Create one last reduce op that reduces all the blocks + std::unique_ptr subOp; + Kernel kernel; + if ( isReduceInH ) kernel = op->Kernel()->WithSize({1 /* W */, blockCount /* H */}); + else if ( isReduceInW ) kernel = op->Kernel()->WithSize({blockCount /* W */, 1 /* H */}); + subOp = MakeSubOperation(op.get(), isReduceInC ? 
nullptr : &kernel); + + auto *subOpIfmConn = subOp->IFM(0); + subOpIfmConn->tensor = newTensor; + subOpIfmConn->shape = newTensor->storageShape; + subOpIfmConn->slice.offset = newTensor->storageShape.WithZeros(); + subOpIfmConn->slice.shape = ifmSlice.shape.With(reducedAxis, blockCount); + subOpIfmConn->quantization = Quantization::Unit(); + newTensor->consumers.push_back(subOp.get()); + auto *subOpOfmConn = subOp->OFM(); + subOpOfmConn->quantization = Quantization::Unit(); + + LOG_TRACE1("DecomposeReduce: Final block, IFM ({}) @ ({}) from ({}), OFM ({}) @ ({}) from ({})\n", + subOpIfmConn->slice.shape.ToString(), subOpIfmConn->slice.offset.ToString(), subOpIfmConn->shape.ToString(), + subOpOfmConn->slice.shape.ToString(), subOpOfmConn->slice.offset.ToString(), subOpOfmConn->shape.ToString()); + + auto subOps = DecomposeReduce(arch, std::move(subOp)); + result.insert(result.end(), std::make_move_iterator(subOps.begin()), std::make_move_iterator(subOps.end())); + return result; + } + + // Handle non-reduced axes + for ( int axis = 0; axis < ifmRank; axis++ ) { - if ( ofmShape[axis] > MAX_DIM ) + // At this point the reduced axis should not be too large + assert(ifmShape[axis] <= MAX_DIM || axis != reducedAxis); + + if ( ifmShape[axis] > MAX_DIM ) { - if ( axis == reducedAxis ) - { - // TODO: MLBEDSW-9408 reduced axis requires specific decomposition - continue; - } return DecomposeLargeAxis(axis, MAX_DIM, arch, std::move(op), DecomposeReduce); } } diff --git a/ethosu/regor/compiler/scheduler_packing.cpp b/ethosu/regor/compiler/scheduler_packing.cpp index 69740d3b..2e7b9cbd 100644 --- a/ethosu/regor/compiler/scheduler_packing.cpp +++ b/ethosu/regor/compiler/scheduler_packing.cpp @@ -635,6 +635,7 @@ std::unique_ptr SchedulerPacking::MakeSchedulerOperation(Ope int paddedAxes = schedOp->Output(TensorUsage::OFM)->shape.Size() - op->Output(TensorUsage::OFM)->shape.Size(); assert(paddedAxes >= 0); attr->axis += paddedAxes; + assert(attr->axis < 
schedOp->Input(TensorUsage::IFM)->shape.Size()); } // Update OFM transpose mask if operator has the attribute else if ( schedOp->HasAttribute() ) diff --git a/ethosu/regor/test/test_scheduler_decompose.cpp b/ethosu/regor/test/test_scheduler_decompose.cpp index 7c4d327d..04568fe1 100644 --- a/ethosu/regor/test/test_scheduler_decompose.cpp +++ b/ethosu/regor/test/test_scheduler_decompose.cpp @@ -18,6 +18,7 @@ #include "common/common.hpp" +#include "architecture/ethosu85/ethos_u85.hpp" #include "compiler/scheduler_decompose.hpp" #include "util.hpp" @@ -57,6 +58,8 @@ std::unique_ptr CreateOperation(OpType opType, Shape ifmShap TEST_CASE("test_scheduler_decompose") { + auto arch = CreateArchDefault(1024); + SECTION("Decompose matmul in height dimension") { Shape ifmShape(1, 100, 3, 2); // ifm2 is transposed by graphIR optimiser to same shape as ifm1 @@ -227,14 +230,15 @@ TEST_CASE("test_scheduler_decompose") REQUIRE(decomposedOps.size() == 1); REQUIRE(orig == decomposedOps[0].get()); } - SECTION("Decompose large axis") + SECTION("Decompose reduce large axis (non-reduced axis)") { uint32_t maxSize = (1UL << 16); uint32_t shapeSize = maxSize * 10 + 5; Shape ifmShape(1, 1, shapeSize, 5); Shape ofmShape(1, 1, shapeSize, 5); auto op = CreateOperation(OpType::ReduceMax, ifmShape, ofmShape); - std::vector> decomposedOps = DecomposeReduce(nullptr, std::move(op)); + op->Attribute()->axis = 1; // H + std::vector> decomposedOps = DecomposeReduce(arch.get(), std::move(op)); REQUIRE(decomposedOps.size() == 11); for ( size_t i = 0; i < decomposedOps.size(); i++ ) { @@ -246,7 +250,7 @@ TEST_CASE("test_scheduler_decompose") REQUIRE(ofmSlice.shape == ofmShape.WithWidth(expectedWidth)); } } - SECTION("Decompose large axis (sliced)") + SECTION("Decompose reduce large axis (non-reduced axis, sliced)") { uint32_t maxSize = (1UL << 16); uint32_t shapeSize = maxSize * 10 + 5; @@ -263,9 +267,10 @@ TEST_CASE("test_scheduler_decompose") Shape ifmSliceShape(1, 1, maxSize * 2 + 7, 10); Shape 
ofmSliceShape(1, 1, maxSize * 2 + 7, 10); auto op = CreateOperation(OpType::ReduceMax, ifmShape, ofmShape); + op->Attribute()->axis = 1; // H op->Input(TensorUsage::IFM0)->slice = {ifmSliceOffset, ifmSliceShape}; op->Output(TensorUsage::OFM)->slice = {ofmSliceOffset, ofmSliceShape}; - std::vector> decomposedOps = DecomposeReduce(nullptr, std::move(op)); + std::vector> decomposedOps = DecomposeReduce(arch.get(), std::move(op)); REQUIRE(decomposedOps.size() == 3); for ( size_t i = 0; i < decomposedOps.size(); i++ ) { @@ -279,4 +284,52 @@ TEST_CASE("test_scheduler_decompose") REQUIRE(ifmSlice.offset == (ifmSliceOffset + Shape(0, 0, i * maxSize, 0))); } } + SECTION("Decompose reduce large axis (reduced axis)") + { + int maxSize = (1UL << 16); + int shapeSize = maxSize * 10 + 5; + Shape ifmShape(1, 1, shapeSize, 5); + Shape ofmShape(1, 1, 1, 5); + auto op = CreateOperation(OpType::ReduceMax, ifmShape, ofmShape); + op->Attribute()->axis = 2; // W + std::vector> decomposedOps = DecomposeReduce(arch.get(), std::move(op)); + REQUIRE(decomposedOps.size() == 12); + for ( int i = 0; i < int(decomposedOps.size()) - 1; i++ ) + { + // Check each block + auto &subOp = decomposedOps[i]; + auto &ifmSlice = subOp->Input(TensorUsage::IFM0)->slice; + auto &ofmSlice = subOp->Output(TensorUsage::OFM)->slice; + int blockSize = std::min(maxSize, shapeSize - i * maxSize); + REQUIRE(ifmSlice.shape == ifmShape.WithWidth(blockSize)); + REQUIRE(ofmSlice.shape == ofmShape.WithWidth(1)); + REQUIRE(ifmSlice.offset == ifmShape.WithZeros().WithWidth(i * maxSize)); + REQUIRE(ofmSlice.offset == ofmShape.WithZeros().WithWidth(i)); + } + // Check final reduce + auto &subOp = decomposedOps.back(); + auto &ifmSlice = subOp->Input(TensorUsage::IFM0)->slice; + auto &ofmSlice = subOp->Output(TensorUsage::OFM)->slice; + int blockCount = decomposedOps.size() - 1; + REQUIRE(ifmSlice.shape == ifmShape.WithWidth(blockCount)); + REQUIRE(ofmSlice.shape == ofmShape); + REQUIRE(ifmSlice.offset == 
ifmShape.WithZeros()); + REQUIRE(ofmSlice.offset == ofmShape.WithZeros()); + } + SECTION("Decompose reduce with batch dimension") + { + Shape ifmShape(3, 7, 11, 13); + Shape ofmShape(3, 7, 11, 1); + auto op = CreateOperation(OpType::ReduceMax, ifmShape, ofmShape); + op->Attribute()->axis = 3; // C + std::vector> decomposedOps = DecomposeReduce(arch.get(), std::move(op)); + REQUIRE(decomposedOps.size() == 1); + auto &subOp = decomposedOps[0]; + auto &ifmSlice = subOp->Input(TensorUsage::IFM0)->slice; + auto &ofmSlice = subOp->Output(TensorUsage::OFM)->slice; + REQUIRE(ifmSlice.shape == Shape(3 * 7 * 11, 13, 1)); + REQUIRE(ofmSlice.shape == Shape(3 * 7 * 11, 1, 1)); + REQUIRE(ifmSlice.offset == Shape(0, 0, 0)); + REQUIRE(ofmSlice.offset == Shape(0, 0, 0)); + } } -- GitLab