From 981cc4425005b1be73a8a862a8057f6ec91cb37d Mon Sep 17 00:00:00 2001 From: Johan Gunnarsson Date: Fri, 9 May 2025 08:51:03 +0200 Subject: [PATCH] MLBEDSW-9408: Add full REDUCE MIN/MAX/SUM decomposition This implements full decomposition of TOSA REDUCE MIN/MAX/ANY/ALL along the reduced axis. * Extend decomposition to do blockwise reduce operations along the reduced axis. * Add ReduceSum/ReduceMinMax to constraints. * Move reshaping of reduce ops into decomposition. * Move creating a reduce ops kernel to ConvertAttributes. * Remove RewriteReduceMinMaxAnyAll. Signed-off-by: Johan Gunnarsson Change-Id: Ic0873dab1c3c5344045590d1c11724986b896120 --- .../architecture/architecture_constraints.hpp | 2 + .../ethosu55/ethos_u55_constraints.cpp | 16 +- .../ethosu85/ethos_u85_constraints.cpp | 28 +++- ethosu/regor/compiler/graphir_optimiser.cpp | 47 ++---- ethosu/regor/compiler/graphir_optimiser.hpp | 2 - ethosu/regor/compiler/scheduler_decompose.cpp | 151 ++++++++++++++++-- ethosu/regor/compiler/scheduler_packing.cpp | 1 + .../regor/test/test_scheduler_decompose.cpp | 61 ++++++- 8 files changed, 256 insertions(+), 52 deletions(-) diff --git a/ethosu/regor/architecture/architecture_constraints.hpp b/ethosu/regor/architecture/architecture_constraints.hpp index a695aee1..0a84117c 100644 --- a/ethosu/regor/architecture/architecture_constraints.hpp +++ b/ethosu/regor/architecture/architecture_constraints.hpp @@ -53,6 +53,7 @@ struct ArchOperatorQuery ReverseType reverseMask = ReverseType::None; TransposeType transposeMask = TransposeType::None; const Kernel *kernel = nullptr; + int axis = 0; // Uses negative notation: -1 = C, -2 = W, ... 
~ArchOperatorQuery(){}; }; @@ -75,6 +76,7 @@ enum class ArchProperty KernelDilation = 1 << 3, DepthMultiplier = 1 << 4, TransposeMask = 1 << 5, + ReduceAxis = 1 << 6, }; struct ArchRequirements diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_constraints.cpp b/ethosu/regor/architecture/ethosu55/ethos_u55_constraints.cpp index 090530d9..89b061f4 100644 --- a/ethosu/regor/architecture/ethosu55/ethos_u55_constraints.cpp +++ b/ethosu/regor/architecture/ethosu55/ethos_u55_constraints.cpp @@ -319,7 +319,7 @@ Flags EthosU55Constraints::OperatorQuery(OpType opType, const ArchO // Check hardware-required substitutions first if ( (opType == OpType::Sigmoid) || (opType == OpType::Tanh) ) { - if ( query->ifm[0].type != DataType::Int16 ) + if ( query && query->ifm[0].type != DataType::Int16 ) { if ( req ) { @@ -368,6 +368,20 @@ Flags EthosU55Constraints::OperatorQuery(OpType opType, const ArchO return QueryResult::NativeConstrained; } + if ( npuOp == EthosU55NpuOp::ReduceSum ) + { + // unsupported reduce axis (only C supported) + if ( query->axis != -1 /* C */ ) + { + if ( req ) + { + req->req.Set(ArchRequirement::Decompose); + req->decomposeProps.Set(ArchProperty::ReduceAxis); + } + result.Set(QueryResult::HasRequirements); + } + } + const auto &ifmShape = query->ifm[0].shape; const auto &ifm2Shape = query->ifm[1].shape; const auto &ofmShape = query->ofm.shape; diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85_constraints.cpp b/ethosu/regor/architecture/ethosu85/ethos_u85_constraints.cpp index b9b72156..4bdb4d43 100644 --- a/ethosu/regor/architecture/ethosu85/ethos_u85_constraints.cpp +++ b/ethosu/regor/architecture/ethosu85/ethos_u85_constraints.cpp @@ -350,8 +350,8 @@ Flags EthosU85Constraints::OperatorQuery(OpType opType, const ArchO { req->req.Set(ArchRequirement::Decompose); req->decomposeProps.Set(ArchProperty::TransposeMask); - result.Set(QueryResult::HasRequirements); } + result.Set(QueryResult::HasRequirements); } else { @@ -372,6 +372,32 @@ Flags 
EthosU85Constraints::OperatorQuery(OpType opType, const ArchO { return result; } + else if ( npuOp == EthosU85NpuOp::ReduceMinMax ) + { + // unsupported reduce axis (only H and W supported) + if ( query->axis != -3 /* H */ && query->axis != -2 /* W */ ) + { + if ( req ) + { + req->req.Set(ArchRequirement::Decompose); + req->decomposeProps.Set(ArchProperty::ReduceAxis); + } + result.Set(QueryResult::HasRequirements); + } + } + else if ( npuOp == EthosU85NpuOp::ReduceSum ) + { + // unsupported reduce axis (only C supported) + if ( query->axis != -1 /* C */ ) + { + if ( req ) + { + req->req.Set(ArchRequirement::Decompose); + req->decomposeProps.Set(ArchProperty::ReduceAxis); + } + result.Set(QueryResult::HasRequirements); + } + } const auto &ifmShape = query->ifm[0].shape; const auto &ifm2Shape = query->ifm[1].shape; diff --git a/ethosu/regor/compiler/graphir_optimiser.cpp b/ethosu/regor/compiler/graphir_optimiser.cpp index af15538f..c0a919e3 100644 --- a/ethosu/regor/compiler/graphir_optimiser.cpp +++ b/ethosu/regor/compiler/graphir_optimiser.cpp @@ -199,6 +199,22 @@ Operation *GraphIrOptimiser::ConvertAttributes(Graph *const graph, Operation *co assert((mask == ReverseType::None || IsPowerOfTwo(unsigned(mask))) && "Reverse operation can only have one axis"); ofmConn->reverse = mask; } + else if ( opType == OpType::ReduceMin || opType == OpType::ReduceMax || opType == OpType::ReduceAny || opType == OpType::ReduceAll ) + { + TensorConnection *ifmConn = operation->Input(TensorUsage::IFM); + auto *attr = operation->Attribute(); + auto axis = attr->axis; + if ( axis < 0 ) axis = ifmConn->shape.Size() + axis; + assert(axis >= 0); + assert(axis < ifmConn->shape.Size()); + // Create a reduce kernel, if reducing in H or W + Kernel kernel = *operation->Kernel(); + if ( axis == ifmConn->shape.Size() - 3 ) + kernel = operation->Kernel()->WithSize({1 /* W */, ifmConn->shape.Height() /* H */}); + else if ( axis == ifmConn->shape.Size() - 2 ) + kernel = 
operation->Kernel()->WithSize({ifmConn->shape.Width() /* W */, 1 /* H */}); + operation->SetKernel(std::make_unique(std::move(kernel))); + } return operation; } @@ -1345,36 +1361,6 @@ Operation *GraphIrOptimiser::RewriteSelect(Graph *const graph, Operation *const return returnOp; } -// Rewrite REDUCE_{MIN,MAX,ANY,ALL} IFM/OFM shapes and set a kernel matching the axis to reduce -Operation *GraphIrOptimiser::RewriteReduceMinMaxAnyAll(Graph *const graph, Operation *const operation) -{ - UNUSED(graph); - Operation *returnOp = operation; - const OpType opType = operation->Type(); - if ( opType == OpType::ReduceMin || opType == OpType::ReduceMax || opType == OpType::ReduceAny || opType == OpType::ReduceAll ) - { - auto *ifmConn = operation->Input(TensorUsage::IFM); - auto *ofmConn = operation->Output(TensorUsage::OFM); - auto *attr = operation->Attribute(); - auto axis = attr->axis; - if ( axis < 0 ) axis = ifmConn->shape.Size() + axis; - assert(axis >= 0); - assert(axis < ifmConn->shape.Size()); - - // Reshape IFM/OFM so IFM is HxWxC and OFM is Hx1xC - ifmConn->shape = ReshapeTo3DAroundAxis(ifmConn->shape, axis); - ofmConn->shape = ifmConn->shape.WithWidth(1); - - // Update the axis to reduce to match the reshapes shapes - attr->axis = 1; - - // Set kernel to 1xW (where W is the width of the reshaped shapes) - auto kernel = operation->Kernel()->WithSize({ifmConn->shape.Width() /* W */, 1 /* H */}); - operation->SetKernel(std::make_unique(std::move(kernel))); - } - return returnOp; -} - // Rewrite REDUCE_SUM with any axis into a REDUCE_SUM with C axis Operation *GraphIrOptimiser::RewriteReduceSum(Graph *const graph, Operation *const operation) { @@ -1518,6 +1504,7 @@ Operation *GraphIrOptimiser::RewriteReduceSum(Graph *const graph, Operation *con operation->Input(TensorUsage::IFM)->Set(ifmShape3D); operation->Output(TensorUsage::OFM)->Set(ifmShape3D.WithDepth(1)); + attr->axis = 2; // C } } } diff --git a/ethosu/regor/compiler/graphir_optimiser.hpp 
b/ethosu/regor/compiler/graphir_optimiser.hpp index bc79c703..59037ce7 100644 --- a/ethosu/regor/compiler/graphir_optimiser.hpp +++ b/ethosu/regor/compiler/graphir_optimiser.hpp @@ -59,7 +59,6 @@ private: Operation *RewriteSlice(Graph *const graph, Operation *const operation); Operation *RewriteNegate(Graph *const graph, Operation *const operation); Operation *RewriteSelect(Graph *const graph, Operation *const operation); - Operation *RewriteReduceMinMaxAnyAll(Graph *const graph, Operation *const operation); Operation *RewriteReduceSum(Graph *const graph, Operation *const operation); Operation *RewriteTile(Graph *const graph, Operation *const operation); Operation *RewriteMatmul(Graph *const graph, Operation *const operation); @@ -147,7 +146,6 @@ private: &GraphIrOptimiser::RewriteConcat, &GraphIrOptimiser::RewriteSlice, &GraphIrOptimiser::RewriteNegate, - &GraphIrOptimiser::RewriteReduceMinMaxAnyAll, &GraphIrOptimiser::RewriteReduceSum, &GraphIrOptimiser::RewriteTile, &GraphIrOptimiser::RewriteMatmul, diff --git a/ethosu/regor/compiler/scheduler_decompose.cpp b/ethosu/regor/compiler/scheduler_decompose.cpp index 09e7b7be..357245e9 100644 --- a/ethosu/regor/compiler/scheduler_decompose.cpp +++ b/ethosu/regor/compiler/scheduler_decompose.cpp @@ -40,6 +40,15 @@ Flags OperatorQuery(Architecture *arch, const SchedulerOperation *s query.transposeMask = ofmConn->transpose; query.reverseMask = ofmConn->reverse; query.kernel = schedOp->Kernel(); + if ( schedOp->HasAttribute() ) + { + query.axis = schedOp->Attribute()->axis; + if ( query.axis >= 0 ) + { + // Convert axis to negative notation + query.axis -= query.ifm[0].shape.Size(); + } + } return arch->Constraints()->OperatorQuery(schedOp->Type(), &query, req); } @@ -1427,27 +1436,141 @@ std::vector> DecomposeReduce(Architecture *a ofmSlice.Initialize(ofmShape.WithZeros(), ofmShape); ifmSlice.Initialize(ifmShape.WithZeros(), ifmShape); - if ( auto ifm2Conn = op->TryInput(TensorUsage::IFM1) ) + const auto ifmRank = 
ifmShape.Size(); + auto attr = op->Attribute(); + const int reducedAxis = attr->axis; + assert(reducedAxis >= 0); + assert(reducedAxis < ifmRank); + const bool isReduceInH = reducedAxis == ifmRank - 3; + const bool isReduceInW = reducedAxis == ifmRank - 2; + const bool isReduceInC = reducedAxis == ifmRank - 1; + + // Decompose Reduce Min/Max/Sum with the following algorithm so that it can run on NPU. + // + // 1. Reshape the IFM/OFM so that the dimension to reduce is either H, W or C (depending on which type of operation + // it is) and IFM/OFM are 3D shapes. When reshaping >4D shapes, we may lose the slice information, so therefore, + // at this point slicing is not supported. + // 2. Create operations so that the reduced axis is reduced in blocks of 64k, with a final operation to produce the + // results in the original OFM. + + // Figure out what we need to decompose + ArchRequirements req{}; + auto qResult = OperatorQuery(arch, op.get(), &req); + bool decomposeReshape = false; + if ( qResult.Any(QueryResult::HasRequirements) && req.req.Any(ArchRequirement::Decompose) ) { - auto ifm2Shape = ifm2Conn->shape; - auto &ifm2Slice = ifm2Conn->slice; + decomposeReshape = req.decomposeProps.Any(ArchProperty::ReduceAxis, ArchProperty::TensorDims); + } - ifm2Slice.Initialize(ifm2Shape.WithZeros(), ifm2Shape); + // Reshape to a 3D tensor + if ( decomposeReshape ) + { + // Slice offset not supported if we need to reshape + assert(ofmSlice.offset.GreaterMask(ofmSlice.offset.WithZeros()) == 0); + assert(ifmSlice.offset.GreaterMask(ifmSlice.offset.WithZeros()) == 0); + + if ( op->Type() == OpType::ReduceSum ) + { + // ReduceSum can only reduce in C + assert(isReduceInC); + + // Reshape to 3D with all >=H dimensions in H + ifmConn->shape = ReshapeTo3D(ifmConn->shape, {ifmConn->shape.Size() - 2, 1, 1}); + ifmSlice = {}; + ofmConn->shape = ReshapeTo3D(ofmConn->shape, {ofmConn->shape.Size() - 2, 1, 1}); + ofmSlice = {}; + attr->axis = 2; // C + } + else + { + // Reshape to 3D 
around W + ifmConn->shape = ReshapeTo3DAroundAxis(ifmConn->shape, reducedAxis); + ifmSlice = {}; + ofmConn->shape = ReshapeTo3DAroundAxis(ofmConn->shape, reducedAxis); + ofmSlice = {}; + op->SetKernel(op->Kernel()->WithSize({ifmConn->shape.Width() /* W */, 1 /* H */})); + attr->axis = 1; // W + } + + return DecomposeReduce(arch, std::move(op)); } - auto ofmRank = ofmShape.Size(); - auto attr = op->Attribute(); - int reducedAxis = attr->axis; + // Handle reduced axis + if ( ifmShape[reducedAxis] > MAX_DIM ) + { + // Create an intermediate tensor + const int blockCount = (ifmShape[reducedAxis] - 1) / MAX_DIM + 1; + auto newTensor = ifmConn->tensor->Clone(); + newTensor->srcTensor = nullptr; + newTensor->storageShape = ifmShape.With(reducedAxis, blockCount); - for ( int axis = 0; axis < ofmRank; axis++ ) + LOG_TRACE1("DecomposeReduce: Reduce dimension too large, axis {}, size {}, intermediate shape ({})\n", + reducedAxis, ifmShape[reducedAxis], newTensor->storageShape.ToString()); + + for ( int blockIndex = 0; blockIndex < blockCount; blockIndex++ ) + { + // Create one new reduce op for each block + const int blockSize = std::min(MAX_DIM, ifmShape[reducedAxis] - blockIndex * MAX_DIM); + std::unique_ptr subOp; + Kernel kernel; + if ( isReduceInH ) kernel = op->Kernel()->WithSize({1 /* W */, blockSize /* H */}); + else if ( isReduceInW ) kernel = op->Kernel()->WithSize({blockSize /* W */, 1 /* H */}); + subOp = MakeSubOperation(op.get(), isReduceInC ? 
nullptr : &kernel); + + auto *subOpIfmConn = subOp->IFM(0); + subOpIfmConn->slice.offset = ifmSlice.offset.With(reducedAxis, blockIndex * MAX_DIM); + subOpIfmConn->slice.shape = ifmSlice.shape.With(reducedAxis, blockSize); + subOpIfmConn->quantization = ifmConn->quantization; + auto *subOpOfmConn = subOp->OFM(); + subOpOfmConn->tensor = newTensor; + subOpOfmConn->shape = newTensor->storageShape; + subOpOfmConn->slice.offset = ofmSlice.offset.With(reducedAxis, blockIndex); + subOpOfmConn->slice.shape = ofmSlice.shape.With(reducedAxis, 1); + subOpOfmConn->quantization = ofmConn->quantization; + newTensor->producers.push_back(subOp.get()); + + LOG_TRACE1("DecomposeReduce: Block, IFM ({}) @ ({}) from ({}), OFM ({}) @ ({}) from ({})\n", + subOpIfmConn->slice.shape.ToString(), subOpIfmConn->slice.offset.ToString(), subOpIfmConn->shape.ToString(), + subOpOfmConn->slice.shape.ToString(), subOpOfmConn->slice.offset.ToString(), subOpOfmConn->shape.ToString()); + + auto subOps = DecomposeReduce(arch, std::move(subOp)); + result.insert(result.end(), std::make_move_iterator(subOps.begin()), std::make_move_iterator(subOps.end())); + } + + // Create one last reduce op that reduces all the blocks + std::unique_ptr subOp; + Kernel kernel; + if ( isReduceInH ) kernel = op->Kernel()->WithSize({1 /* W */, blockCount /* H */}); + else if ( isReduceInW ) kernel = op->Kernel()->WithSize({blockCount /* W */, 1 /* H */}); + subOp = MakeSubOperation(op.get(), isReduceInC ? 
nullptr : &kernel); + + auto *subOpIfmConn = subOp->IFM(0); + subOpIfmConn->tensor = newTensor; + subOpIfmConn->shape = newTensor->storageShape; + subOpIfmConn->slice.offset = newTensor->storageShape.WithZeros(); + subOpIfmConn->slice.shape = ifmSlice.shape.With(reducedAxis, blockCount); + subOpIfmConn->quantization = Quantization::Unit(); + newTensor->consumers.push_back(subOp.get()); + auto *subOpOfmConn = subOp->OFM(); + subOpOfmConn->quantization = Quantization::Unit(); + + LOG_TRACE1("DecomposeReduce: Final block, IFM ({}) @ ({}) from ({}), OFM ({}) @ ({}) from ({})\n", + subOpIfmConn->slice.shape.ToString(), subOpIfmConn->slice.offset.ToString(), subOpIfmConn->shape.ToString(), + subOpOfmConn->slice.shape.ToString(), subOpOfmConn->slice.offset.ToString(), subOpOfmConn->shape.ToString()); + + auto subOps = DecomposeReduce(arch, std::move(subOp)); + result.insert(result.end(), std::make_move_iterator(subOps.begin()), std::make_move_iterator(subOps.end())); + return result; + } + + // Handle non-reduced axes + for ( int axis = 0; axis < ifmRank; axis++ ) { - if ( ofmShape[axis] > MAX_DIM ) + // At this point the reduced axis should not be too large + assert(ifmShape[axis] <= MAX_DIM || axis != reducedAxis); + + if ( ifmShape[axis] > MAX_DIM ) { - if ( axis == reducedAxis ) - { - // TODO: MLBEDSW-9408 reduced axis requires specific decomposition - continue; - } return DecomposeLargeAxis(axis, MAX_DIM, arch, std::move(op), DecomposeReduce); } } diff --git a/ethosu/regor/compiler/scheduler_packing.cpp b/ethosu/regor/compiler/scheduler_packing.cpp index 69740d3b..2e7b9cbd 100644 --- a/ethosu/regor/compiler/scheduler_packing.cpp +++ b/ethosu/regor/compiler/scheduler_packing.cpp @@ -635,6 +635,7 @@ std::unique_ptr SchedulerPacking::MakeSchedulerOperation(Ope int paddedAxes = schedOp->Output(TensorUsage::OFM)->shape.Size() - op->Output(TensorUsage::OFM)->shape.Size(); assert(paddedAxes >= 0); attr->axis += paddedAxes; + assert(attr->axis < 
schedOp->Input(TensorUsage::IFM)->shape.Size()); } // Update OFM transpose mask if operator has the attribute else if ( schedOp->HasAttribute() ) diff --git a/ethosu/regor/test/test_scheduler_decompose.cpp b/ethosu/regor/test/test_scheduler_decompose.cpp index 7c4d327d..04568fe1 100644 --- a/ethosu/regor/test/test_scheduler_decompose.cpp +++ b/ethosu/regor/test/test_scheduler_decompose.cpp @@ -18,6 +18,7 @@ #include "common/common.hpp" +#include "architecture/ethosu85/ethos_u85.hpp" #include "compiler/scheduler_decompose.hpp" #include "util.hpp" @@ -57,6 +58,8 @@ std::unique_ptr CreateOperation(OpType opType, Shape ifmShap TEST_CASE("test_scheduler_decompose") { + auto arch = CreateArchDefault(1024); + SECTION("Decompose matmul in height dimension") { Shape ifmShape(1, 100, 3, 2); // ifm2 is transposed by graphIR optimiser to same shape as ifm1 @@ -227,14 +230,15 @@ TEST_CASE("test_scheduler_decompose") REQUIRE(decomposedOps.size() == 1); REQUIRE(orig == decomposedOps[0].get()); } - SECTION("Decompose large axis") + SECTION("Decompose reduce large axis (non-reduced axis)") { uint32_t maxSize = (1UL << 16); uint32_t shapeSize = maxSize * 10 + 5; Shape ifmShape(1, 1, shapeSize, 5); Shape ofmShape(1, 1, shapeSize, 5); auto op = CreateOperation(OpType::ReduceMax, ifmShape, ofmShape); - std::vector> decomposedOps = DecomposeReduce(nullptr, std::move(op)); + op->Attribute()->axis = 1; // H + std::vector> decomposedOps = DecomposeReduce(arch.get(), std::move(op)); REQUIRE(decomposedOps.size() == 11); for ( size_t i = 0; i < decomposedOps.size(); i++ ) { @@ -246,7 +250,7 @@ TEST_CASE("test_scheduler_decompose") REQUIRE(ofmSlice.shape == ofmShape.WithWidth(expectedWidth)); } } - SECTION("Decompose large axis (sliced)") + SECTION("Decompose reduce large axis (non-reduced axis, sliced)") { uint32_t maxSize = (1UL << 16); uint32_t shapeSize = maxSize * 10 + 5; @@ -263,9 +267,10 @@ TEST_CASE("test_scheduler_decompose") Shape ifmSliceShape(1, 1, maxSize * 2 + 7, 10); Shape 
ofmSliceShape(1, 1, maxSize * 2 + 7, 10); auto op = CreateOperation(OpType::ReduceMax, ifmShape, ofmShape); + op->Attribute()->axis = 1; // H op->Input(TensorUsage::IFM0)->slice = {ifmSliceOffset, ifmSliceShape}; op->Output(TensorUsage::OFM)->slice = {ofmSliceOffset, ofmSliceShape}; - std::vector> decomposedOps = DecomposeReduce(nullptr, std::move(op)); + std::vector> decomposedOps = DecomposeReduce(arch.get(), std::move(op)); REQUIRE(decomposedOps.size() == 3); for ( size_t i = 0; i < decomposedOps.size(); i++ ) { @@ -279,4 +284,52 @@ TEST_CASE("test_scheduler_decompose") REQUIRE(ifmSlice.offset == (ifmSliceOffset + Shape(0, 0, i * maxSize, 0))); } } + SECTION("Decompose reduce large axis (reduced axis)") + { + int maxSize = (1UL << 16); + int shapeSize = maxSize * 10 + 5; + Shape ifmShape(1, 1, shapeSize, 5); + Shape ofmShape(1, 1, 1, 5); + auto op = CreateOperation(OpType::ReduceMax, ifmShape, ofmShape); + op->Attribute()->axis = 2; // W + std::vector> decomposedOps = DecomposeReduce(arch.get(), std::move(op)); + REQUIRE(decomposedOps.size() == 12); + for ( int i = 0; i < int(decomposedOps.size()) - 1; i++ ) + { + // Check each block + auto &subOp = decomposedOps[i]; + auto &ifmSlice = subOp->Input(TensorUsage::IFM0)->slice; + auto &ofmSlice = subOp->Output(TensorUsage::OFM)->slice; + int blockSize = std::min(maxSize, shapeSize - i * maxSize); + REQUIRE(ifmSlice.shape == ifmShape.WithWidth(blockSize)); + REQUIRE(ofmSlice.shape == ofmShape.WithWidth(1)); + REQUIRE(ifmSlice.offset == ifmShape.WithZeros().WithWidth(i * maxSize)); + REQUIRE(ofmSlice.offset == ofmShape.WithZeros().WithWidth(i)); + } + // Check final reduce + auto &subOp = decomposedOps.back(); + auto &ifmSlice = subOp->Input(TensorUsage::IFM0)->slice; + auto &ofmSlice = subOp->Output(TensorUsage::OFM)->slice; + int blockCount = decomposedOps.size() - 1; + REQUIRE(ifmSlice.shape == ifmShape.WithWidth(blockCount)); + REQUIRE(ofmSlice.shape == ofmShape); + REQUIRE(ifmSlice.offset == 
ifmShape.WithZeros()); + REQUIRE(ofmSlice.offset == ofmShape.WithZeros()); + } + SECTION("Decompose reduce with batch dimension") + { + Shape ifmShape(3, 7, 11, 13); + Shape ofmShape(3, 7, 11, 1); + auto op = CreateOperation(OpType::ReduceMax, ifmShape, ofmShape); + op->Attribute()->axis = 3; // C + std::vector> decomposedOps = DecomposeReduce(arch.get(), std::move(op)); + REQUIRE(decomposedOps.size() == 1); + auto &subOp = decomposedOps[0]; + auto &ifmSlice = subOp->Input(TensorUsage::IFM0)->slice; + auto &ofmSlice = subOp->Output(TensorUsage::OFM)->slice; + REQUIRE(ifmSlice.shape == Shape(3 * 7 * 11, 13, 1)); + REQUIRE(ofmSlice.shape == Shape(3 * 7 * 11, 1, 1)); + REQUIRE(ifmSlice.offset == Shape(0, 0, 0)); + REQUIRE(ofmSlice.offset == Shape(0, 0, 0)); + } } -- GitLab