From 65869886e17247f56fb6f8e0c1892427e0982196 Mon Sep 17 00:00:00 2001
From: Fredrik Svedberg
Date: Mon, 20 Jan 2025 11:02:57 +0100
Subject: [PATCH] MLBEDSW-9697 Add support for TOSA CONV3D

Added initial support for TOSA CONV3D.

Change-Id: Id4de4a11da26a555f05941f08d8f176309fcefda
Signed-off-by: Fredrik Svedberg
---
 ethosu/regor/architecture/architecture.hpp       |   1 +
 .../ethos_u55_register_cs_generator.cpp          |   1 +
 ethosu/regor/architecture/ethosu85/ethos_u85.cpp |   2 +-
 .../ethos_u85_register_cs_generator.cpp          |  21 +-
 .../high_level_command_stream_generator.cpp      |  10 +-
 ethosu/regor/compiler/scheduler.cpp              |  14 +-
 ethosu/regor/compiler/scheduler.hpp              |   5 +-
 ethosu/regor/compiler/scheduler_decompose.cpp    | 213 ++++++++++++++++--
 ethosu/regor/compiler/scheduler_decompose.hpp    |   1 +
 ethosu/regor/compiler/scheduler_operation.hpp    |  17 ++
 ethosu/regor/compiler/scheduler_packing.cpp      |   3 +
 .../test/test_fast_storage_allocator.cpp         |   1 +
 ethosu/regor/tflite/custom_operator_ethosu.hpp   |   5 +-
 ethosu/regor/tosa/tosa_reader.cpp                |   7 +-
 14 files changed, 265 insertions(+), 36 deletions(-)

diff --git a/ethosu/regor/architecture/architecture.hpp b/ethosu/regor/architecture/architecture.hpp
index 7546e762..cdf3e104 100644
--- a/ethosu/regor/architecture/architecture.hpp
+++ b/ethosu/regor/architecture/architecture.hpp
@@ -202,6 +202,7 @@ struct ArchitectureConfigQuery
     Shape ofmShape;
     Shape ifmShape[2];
     int ifmBits;
+    int ofmBits;
     Kernel *kernel;
     int lutBytes;
     bool scaled;
diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.cpp b/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.cpp
index 11469d1e..332bb96c 100644
--- a/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.cpp
+++ b/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.cpp
@@ -1447,6 +1447,7 @@ void EthosU55RCSGenerator::InsertTransposeCommand(const HLCStripe *stripe, Tempo
     ArchitectureConfigQuery query{};
     query.kernel = &cmd->operation->kernel;
     query.ifmBits = DataTypeSizeBits(ifm.dataType);
+    query.ofmBits = DataTypeSizeBits(ofm.dataType);
     query.ifmShape[0] = inFM.shape;
     query.ofmShape = outFM.shape;
     query.ofmFormat = TensorFormat::NHWC;
diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85.cpp b/ethosu/regor/architecture/ethosu85/ethos_u85.cpp
index 28e23e70..1f3907b7 100644
--- a/ethosu/regor/architecture/ethosu85/ethos_u85.cpp
+++ b/ethosu/regor/architecture/ethosu85/ethos_u85.cpp
@@ -912,7 +912,7 @@ std::unique_ptr ArchEthosU85::FindBlockConfig(OpType opTyp
     // Accumulator settings
     EthosU85Accumulator accType = EthosU85Accumulator::Acc32;
     if ( (query.ifmBits == 16 && !isPooling && query.scaled) ||  // Normal 16-bit selection
-         (query.ifmBits > 32) )  // Special case for Rescale int48
+         (query.ifmBits > 32) || (query.ofmBits > 32) )  // Special case for Rescale int48
     {
         accType = EthosU85Accumulator::Acc48;
     }
diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.cpp b/ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.cpp
index 9412c0b0..c2f352b8 100644
--- a/ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.cpp
+++ b/ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.cpp
@@ -1470,6 +1470,10 @@ void EthosU85RCSGenerator::GenerateAccFormat(const HLCStripe *stripe)
     {
         accType = config->Acc();
         accSrc = config->AccSource();
+        assert(
+            accSrc != ArchAccumulatorSource::Ifm2 ||
+            (stripe->operation->ifm[1].dataType == DataType::Int32 && accType == EthosU85Accumulator::Acc32) ||
+            (stripe->operation->ifm[1].dataType == DataType::Int64 && accType == EthosU85Accumulator::Acc48));
     }
 
     acc_format format = accType == EthosU85Accumulator::Acc32 ? acc_format::I32 : acc_format::I48;
@@ -1694,8 +1698,9 @@ void EthosU85RCSGenerator::GenerateOperationCode(const HLCOperation *op)
     }
     else if ( IsConvolution(opType) || IsVectorProduct(opType) )
     {
-        // Dynamic weights when op->ifm.size() == 2, _weights_ifm2 parameter should be True
-        Emit(isa::npu_op_conv_t(op->ifm.size() == 2));
+        // Dynamic weights when op->ifm.size() == 2 and acc source != ifm2, _weights_ifm2 parameter should be True
+        auto accSource = static_cast<EthosU85OpConfig *>(op->config)->AccSource();
+        Emit(isa::npu_op_conv_t(op->ifm.size() == 2 && accSource != ArchAccumulatorSource::Ifm2));
     }
     else if ( IsElementwise(opType) )
     {
@@ -1777,18 +1782,22 @@ void EthosU85RCSGenerator::GenerateCommon(const HLCStripe *stripe, bool useGloba
 void EthosU85RCSGenerator::GenerateConvolutionOp(const HLCStripe *stripe, MemoryAccesses &memoryAccesses)
 {
     auto op = stripe->operation.get();
+    EthosU85OpConfig *config = static_cast<EthosU85OpConfig *>(op->config);
     QuantizedScale ofmScale(1, 0);
     bool useGlobalScale = false;
 
     ethosU85Scaling::RescaleConvolution(op);
 
     if ( op->ifm.size() == 2 )
     {
-        // Dynamic weights
-        assert(ToActivationPrecision(op->ifm[0].dataType) == ToActivationPrecision(op->ifm[1].dataType));
-        useGlobalScale = true;
         GenerateIFM2Precision(op->ifm[1], false, false);
         GenerateIFM2(op->type, op->ifm[1], stripe->ifmAreas[1], false, 0, -1);
-        Emit(isa::npu_set_weight_format_t(weight_format::SWD, weight_sparsity::NONE));  // Reset weight format
+        if ( config->AccSource() != ArchAccumulatorSource::Ifm2 )
+        {
+            // Dynamic weights
+            assert(ToActivationPrecision(op->ifm[0].dataType) == ToActivationPrecision(op->ifm[1].dataType));
+            useGlobalScale = true;
+            Emit(isa::npu_set_weight_format_t(weight_format::SWD, weight_sparsity::NONE));  // Reset weight format
+        }
     }
 
     if ( !op->ofm.quantization.scales.empty() )
diff --git a/ethosu/regor/compiler/high_level_command_stream_generator.cpp b/ethosu/regor/compiler/high_level_command_stream_generator.cpp
index 9b9b5863..c73a0891 100644
--- a/ethosu/regor/compiler/high_level_command_stream_generator.cpp
+++ b/ethosu/regor/compiler/high_level_command_stream_generator.cpp
@@ -60,7 +60,7 @@ enum class TransformLimit
 static Box TransformWithStridesAndSkirt(const Box &outputArea, const Shape *strides, const Point2i &inputStep,
     const HLCPadding *skirt, const Shape &ifmShape, OpType opType, const Shape &concatOffsets, const Shape &splitOffset,
     const Shape &splitShape, int dilatedKernelHeight, int upscalingFactor, int &padTop, int &padBottom,
-    TransformLimit limit = TransformLimit::None, TransposeType transposeType = TransposeType::None)
+    TransformLimit limit = TransformLimit::None, TransposeType transposeType = TransposeType::None, bool accIfm = false)
 {
     Shape outputAreaStart = outputArea.Start().Unpermute(uint32_t(transposeType));
     Shape outputAreaEnd = outputArea.End().Unpermute(uint32_t(transposeType));
@@ -109,6 +109,9 @@
         start = splitOffset;
         end = start + splitShape;
     }
+
+    if ( accIfm ) return Box(start, end);
+
     end = Shape::Min(end, Shape::Max(ifmShape, Shape(1, 1, 1, 1)).WithHW(ifmShape.Height() * upscalingFactor, ifmShape.Width() * upscalingFactor));
     padTop = 0;
     padBottom = 0;
@@ -170,7 +173,7 @@
 }
 
 static std::pair<Box, HLCPadding> TransformWithInputOutputSteps(const Box &inputArea, const Point2i &inputStep,
-    const Box &outputArea, const Point2i &outputStep, class Kernel *kernel, const HLCPadding &padding, const Shape &ifmShape)
+    const Box &outputArea, const Point2i &outputStep, const Kernel *kernel, const HLCPadding &padding, const Shape &ifmShape)
 {
     const auto &stride = kernel->Stride();
     const auto dilatedWH = kernel->DilatedWH();
@@ -663,10 +666,11 @@ void HLCStreamGenerator::GenerateHLCStripeCommands(SchedulerOperation *op, const
     {
         if ( !IsIFM(fm.usage) ) continue;
         auto ifmConn = op->Input(fm.usage);
+        bool accIfm = op->AccumulatorMode().source == AccumulatorSource::Ifm2 && fm.usage == TensorUsage::IFM1;
         // Calculate input area based on the output area
         auto inputArea = TransformWithStridesAndSkirt(outputArea, &strides, ifmConn->stepXY, &skirt, ifmConn->shape,
             opType, ofmConn->slice.offset, ifmConn->slice.offset, ifmConn->slice.shape, dilatedKernelHeight,
-            upscaling, hlcStripe->padding.top, hlcStripe->padding.bottom, ifmLimit, ofmConn->transpose);
+            upscaling, hlcStripe->padding.top, hlcStripe->padding.bottom, ifmLimit, ofmConn->transpose, accIfm);
         if ( ofmConn->stepXY != Point2i{1, 1} || ifmConn->stepXY != Point2i{1, 1} )
         {
             std::tie(inputArea, hlcStripe->padding) = TransformWithInputOutputSteps(inputArea,
diff --git a/ethosu/regor/compiler/scheduler.cpp b/ethosu/regor/compiler/scheduler.cpp
index 9a5ea29a..40241bb5 100644
--- a/ethosu/regor/compiler/scheduler.cpp
+++ b/ethosu/regor/compiler/scheduler.cpp
@@ -391,6 +391,7 @@ std::unique_ptr GetOpConfig(Architecture *arch, SchedulerO
     query.ifmShape[0] = ifmShape;
     query.ifmShape[1] = ifm2Shape;
     query.ifmBits = DataTypeSizeBits(ifm->tensor->dataType);
+    query.ofmBits = DataTypeSizeBits(ofm->tensor->dataType);
     query.kernel = op->Kernel();
     query.lutBytes = op->TryInput(TensorUsage::LUT) ? 2048 : 0;
     query.scaled = op->HasScaling();
@@ -680,7 +681,8 @@ std::unique_ptr Scheduler::CreateSchedulerOpInfo(
     {
         blockConfig = parentInfo ? parentInfo->Config()->Clone() : GetOpConfig(_arch, op, ifmShape, ifm2Shape, ofmShape, weightFormat);
     }
-    if ( !weights && op->OFM()->quantization.scales.size() > 1 )
+    auto scales = op->TryInput(TensorUsage::Scales);
+    if ( !weights && (op->OFM()->quantization.scales.size() > 1 || scales) )
     {
         WeightsRef weightsRef;
         weightsRef.isScales = true;
@@ -690,7 +692,8 @@
         auto encodingParams = _arch->WeightEncoder()->GetEncodingConfig(
             blockConfig.get(), weightsRef, op->Kernel(), ifm->tensor->dataType, depthOffsets, weightFormat);
-        weightScales = EncodeQuantizationScaleTensor(std::move(encodingParams), op->OFM()->quantization);
+        const SchedulerTensor *scaleTensor = scales ? scales->tensor.get() : nullptr;
+        weightScales = EncodeQuantizationScaleTensor(std::move(encodingParams), op->OFM()->quantization, scaleTensor);
     }
     // Finally construct and populate operator information (cost)
     auto opInfo = std::make_unique<SchedulerOpInfo>(std::move(blockConfig), ifmShape, ifm2Shape, ofmShape);
@@ -1852,12 +1855,13 @@ static int ApplyZeroPointOHWI(const WeightTransformParam *param, int value)
     return value;
 }
 
-WeightScaleTensors Scheduler::EncodeQuantizationScaleTensor(std::unique_ptr encodingParams, Quantization &ofmQuantization)
+WeightScaleTensors Scheduler::EncodeQuantizationScaleTensor(std::unique_ptr encodingParams,
+    const Quantization &ofmQuantization, const SchedulerTensor *scales)
 {
     SchedulerTensor scaleTens;
     scaleTens.dataType = DataType::Int32;
-
-    return TryEncodeWeightAndScaleTensor(encodingParams.get(), nullptr, &scaleTens, {}, ofmQuantization, false, true);
+    if ( scales == nullptr ) scales = &scaleTens;
+    return TryEncodeWeightAndScaleTensor(encodingParams.get(), nullptr, scales, {}, ofmQuantization, false, true);
 }
 
 WeightScaleTensors Scheduler::EncodeWeightAndScaleTensor(std::unique_ptr encodingParams, const SchedulerTensor *weightTens,
diff --git a/ethosu/regor/compiler/scheduler.hpp b/ethosu/regor/compiler/scheduler.hpp
index 1aff6c59..2abc634c 100644
--- a/ethosu/regor/compiler/scheduler.hpp
+++ b/ethosu/regor/compiler/scheduler.hpp
@@ -1,5 +1,5 @@
 //
-// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates
+// SPDX-FileCopyrightText: Copyright 2021-2025 Arm Limited and/or its affiliates
 //
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -351,7 +351,8 @@ private:
 
     void PrintSchedule(Schedule *schedule);
 
-    WeightScaleTensors EncodeQuantizationScaleTensor(std::unique_ptr encodingParams, Quantization &ofmQuantization);
+    WeightScaleTensors EncodeQuantizationScaleTensor(std::unique_ptr encodingParams,
+        const Quantization &ofmQuantization, const SchedulerTensor *scales = nullptr);
 
     WeightScaleTensors EncodeWeightAndScaleTensor(std::unique_ptr encodingParams, const SchedulerTensor *weightTens,
         const SchedulerTensor *scaleTens, const Quantization &weightQuantization, const Quantization &ofmQuantization);
diff --git a/ethosu/regor/compiler/scheduler_decompose.cpp b/ethosu/regor/compiler/scheduler_decompose.cpp
index f5292271..947f7821 100644
--- a/ethosu/regor/compiler/scheduler_decompose.cpp
+++ b/ethosu/regor/compiler/scheduler_decompose.cpp
@@ -118,11 +118,12 @@ static std::unique_ptr<SchedulerOperation> MakeTransposeOp(
     return op;
 }
 
-static std::unique_ptr<SchedulerOperation> MakeSubOperation(const SchedulerOperation *schedOp, const Kernel *newKernel = nullptr)
+static std::unique_ptr<SchedulerOperation>
+MakeSubOperation(const SchedulerOperation *schedOp, const Kernel *newKernel = nullptr, OpType type = OpType::None)
 {
     assert(schedOp->SubOps().empty());
     assert(schedOp->Parent() == nullptr);
-    auto subOp = std::make_unique<SchedulerOperation>(schedOp->Type());
+    auto subOp = std::make_unique<SchedulerOperation>(type != OpType::None ? type : schedOp->Type());
     subOp->SetKernel(newKernel ? newKernel : schedOp->Kernel());
     subOp->SetHasScaling(schedOp->HasScaling());
     subOp->_srcKey = schedOp->_srcKey;
@@ -179,6 +180,7 @@ static std::unique_ptr GetOpConfig(Architecture *arch, con
         qConfig.ifmShape[1] = ifm1->SliceShape();
     }
     qConfig.ifmBits = DataTypeSizeBits(ifm->tensor->dataType);
+    qConfig.ofmBits = DataTypeSizeBits(ofm->tensor->dataType);
     qConfig.kernel = schedOp->Kernel();
     qConfig.lutBytes = schedOp->TryInput(TensorUsage::LUT) ? 2048 : 0;
     qConfig.scaled = schedOp->HasScaling();
@@ -264,6 +266,7 @@ bool CanRunOnHardware(Architecture *arch, const SchedulerOperation *schedOp)
 
 bool CanDecompose(Architecture *, const SchedulerOperation *schedOp)
 {
     if ( schedOp->Type() == OpType::Conv2D ) return true;
+    if ( schedOp->Type() == OpType::Conv3D ) return true;
     if ( schedOp->Type() == OpType::DepthwiseConv2D ) return true;
     if ( schedOp->Type() == OpType::TransposeConv2D ) return true;
     if ( DecomposeAsElementwise(schedOp->Type()) || schedOp->Type() == OpType::MemoryCopy ) return true;
@@ -487,6 +490,9 @@ template<typename TYPE>
 static std::shared_ptr<SchedulerTensor>
 SliceT(SchedulerTensor *tensor, const Shape &offset, const Shape &shape, const Shape &readShape, const Point2i &stepXY)
 {
+    constexpr int MAX_RANK = 5;
+    assert(shape.Size() <= MAX_RANK);
+    assert(offset.Size() <= MAX_RANK);
     auto paddedInShape = Shape::PadAxes(readShape ? readShape : tensor->bufferView.ViewShape(), shape.Size(), 1);
     const auto &inBufferView = tensor->bufferView.Reshape(paddedInShape).SubView(offset, shape);
     const auto &inBufferValues = inBufferView.Values<TYPE>();
@@ -498,21 +504,21 @@
     auto outBufferValues = outBufferView.WritableValues<TYPE>();
 
     // Copy values into the output buffer
-    auto paddedOutShape = Shape::PadAxes(shape, 4, 1);
-    int batch = paddedOutShape.Batch();
-    int height = paddedOutShape.Height();
-    int width = paddedOutShape.Width();
-    int depth = paddedOutShape.Depth();
-    for ( int n = 0; n < batch; n++ )
+    auto paddedOutShape = Shape::PadAxes(shape, MAX_RANK, 1);
+    auto ndhwc = paddedOutShape.WithZeros();
+    for ( ndhwc[0] = 0; ndhwc[0] < paddedOutShape[0]; ndhwc[0]++ )
     {
-        for ( int h = 0; h < height; h += stepXY.y )
+        for ( ndhwc[1] = 0; ndhwc[1] < paddedOutShape[1]; ndhwc[1]++ )
         {
-            for ( int w = 0; w < width; w += stepXY.x )
+            for ( ndhwc[2] = 0; ndhwc[2] < paddedOutShape[2]; ndhwc[2] += stepXY.y )
             {
-                for ( int c = 0; c < depth; c++ )
+                for ( ndhwc[3] = 0; ndhwc[3] < paddedOutShape[3]; ndhwc[3] += stepXY.x )
                 {
-                    Shape pos({n, h, w, c}, shape.Size());
-                    outBufferValues[pos] = inBufferValues[pos];
+                    for ( ndhwc[4] = 0; ndhwc[4] < paddedOutShape[4]; ndhwc[4]++ )
+                    {
+                        Shape pos(ndhwc, shape.Size());
+                        outBufferValues[pos] = inBufferValues[pos];
+                    }
                 }
             }
         }
@@ -748,6 +754,7 @@ DecomposeForStrides(Architecture *arch, std::unique_ptr op,
     auto weightStepXY = Point2i{SX, SY};
     auto newKernel = kernel->WithStride({1, 1}).WithSize({newWidth, newHeight});
     std::unique_ptr<SchedulerOperation> subOp = MakeSubOperation(op.get(), &newKernel);
+    subOp->RemoveInput(TensorUsage::IFM1);  // Remove acc input
     auto *subIfmConn = subOp->Input(TensorUsage::IFM);
     subIfmConn->slice = std::move(newIfmSlice);
     subIfmConn->stepXY = ifmStrides;
@@ -768,8 +775,15 @@
     accMode.outputEnabled = true;
     result.back()->SetAccumulatorMode(accMode);
     accMode = result.front()->AccumulatorMode();
-    accMode.source = AccumulatorSource::Reset;
+    accMode.source = op->AccumulatorMode().source;
     result.front()->SetAccumulatorMode(accMode);
+    // Reconnect acc input
+    if ( accMode.source == AccumulatorSource::Ifm2 )
+    {
+        auto subOpIfm2 = result.front()->AddInput(TensorUsage::IFM1);
+        *subOpIfm2 = *op->Input(TensorUsage::IFM1);
+        subOpIfm2->tensor->consumers.push_back(result.front().get());
+    }
     return result;
 }
@@ -827,6 +841,177 @@ std::vector<std::unique_ptr<SchedulerOperation>> DecomposeConv2D(Architecture *a
     return result;
 }
 
+std::vector<std::unique_ptr<SchedulerOperation>> DecomposeConv3D(Architecture *arch, std::unique_ptr<SchedulerOperation> op)
+{
+    std::vector<std::unique_ptr<SchedulerOperation>> result;
+    auto *ofmConn = op->Output(TensorUsage::OFM);
+    auto *ifmConn = op->Input(TensorUsage::IFM);
+    auto *weightsConn = op->Input(TensorUsage::Weights);
+    const auto &ofmShape = ofmConn->SliceShape();
+    const auto &ifmShape = ifmConn->SliceShape();
+    auto &ofmSlice = ofmConn->slice;
+    auto &ifmSlice = ifmConn->slice;
+    auto *kernel = op->Kernel();
+    auto &padding = kernel->Padding();
+    ofmSlice.Initialize(ofmShape.WithZeros(), ofmShape);
+    ifmSlice.Initialize(ifmShape.WithZeros(), ifmShape);
+
+    if ( ofmShape[0] > 1 )  // Batch
+    {
+        return DecomposeLeadingDimensions(1, arch, std::move(op), DecomposeConv3D);
+    }
+    const int OD = ofmSlice.shape[1];
+    const int ID = ifmSlice.shape[1];
+    const int KD = kernel->Size3D().z;
+    if ( (arch->Constraints()->SupportsAccumulatorSaveRestore() || KD == 1) && weightsConn->tensor->IsConstant() )
+    {
+        auto InitConnection = [](SchedulerConnection *dst, SchedulerConnection *src, int dOffset, int dSize)
+        {
+            dst->shape = Shape(src->SliceShape(), 4);
+            // Handle batch
+            dst->shape[0] *= src->shape[0];
+            dst->slice.offset = dst->shape.WithZeros().WithBatch(src->slice.offset[0] * dSize + dOffset);
+            dst->slice.shape = dst->shape.WithBatch(1);
+        };
+        // Create SchedulerTensor for ACC
+        auto acc = std::make_shared<SchedulerTensor>();
+        acc->memArea = ofmConn->tensor->memArea;
+        acc->dataType = ifmConn->tensor->dataType == DataType::Int16 ? DataType::Int64 : DataType::Int32;
+        acc->storageShape = Shape(ofmShape, 4).WithBatch(1);
+        acc->uid = acc->equivalenceId = GenerateUniqueId();
+        const auto ifm0uid = GenerateUniqueId();
+        for ( int od = 0; od < OD; od++ )
+        {
+            std::vector<std::unique_ptr<SchedulerOperation>> conv2dSubOps;
+            for ( int kd = 0; kd < KD; kd++ )
+            {
+                const int id = od * kernel->Stride3D().z - padding.Near() + kd * kernel->Dilation3D().z;
+                if ( id >= 0 && id < ID )
+                {
+                    auto subOp = MakeSubOperation(op.get(), nullptr, OpType::Conv2D);
+                    InitConnection(subOp->Output(TensorUsage::OFM), ofmConn, od, OD);
+                    InitConnection(subOp->Input(TensorUsage::IFM), ifmConn, id, ID);
+                    // Update slice offset for DecomposeConv2D pad handling
+                    auto subOpIfm = subOp->Input(TensorUsage::IFM);
+                    subOpIfm->slice.offset = subOpIfm->slice.offset.WithHW(-padding.Top(), -padding.Left());
+
+                    auto subOpWeights = subOp->Input(TensorUsage::Weights);
+                    if ( KD > 1 )
+                    {
+                        auto offset = subOpWeights->shape.WithZeros().With(1, kd);
+                        subOpWeights->tensor = Slice(subOpWeights->tensor.get(), offset, subOpWeights->shape.With(1, 1));
+                        subOpWeights->tensor->consumers.push_back(subOp.get());
+                    }
+                    // New weight shape
+                    auto subOpWeightShape = subOpWeights->shape.Erase(1);
+                    subOpWeights->shape = subOpWeights->tensor->storageShape = subOpWeightShape;
+                    subOpWeights->tensor->bufferView = subOpWeights->tensor->bufferView.Reshape(subOpWeightShape);
+
+                    conv2dSubOps.emplace_back(std::move(subOp));
+                }
+            }
+            if ( conv2dSubOps.empty() )
+            {
+                // Kernel in padding only area, need to broadcast bias to OFM,
+                // using Rescale with bias and ifm zero point as input
+                auto unitKernel = Kernel::UnitKernel();
+                auto subOp = MakeSubOperation(op.get(), &unitKernel, OpType::Rescale);
+                subOp->RemoveInput(TensorUsage::Weights);
+                InitConnection(subOp->Output(TensorUsage::OFM), ofmConn, od, OD);
+
+                // Create SchedulerTensor for 0 input
+                auto subOpIfm = subOp->Input(TensorUsage::IFM);
+                auto ifm0 = std::make_shared<SchedulerTensor>();
+                ifm0->dataType = subOpIfm->tensor->dataType;
+                ifm0->memArea = subOp->Input(TensorUsage::Scales)->tensor->memArea;
+                ifm0->format = TensorFormat::NHWC;
+                const auto &ifm0shape = subOp->Output(TensorUsage::OFM)->slice.shape;
+                const auto bufSize = ifm0shape.Elements();
+                const int64_t ifmZp = subOpIfm->quantization.zeroPoints.empty() ? 0 : subOpIfm->quantization.zeroPoints.front();
+                std::shared_ptr<Buffer> ifm0buf;
+                switch ( ifm0->dataType )
+                {
+                    case DataType::Int8:
+                        ifm0buf = std::make_shared<Buffer>(std::vector<int8_t>(bufSize, int8_t(ifmZp)));
+                        break;
+                    case DataType::Int16:
+                        ifm0buf = std::make_shared<Buffer>(std::vector<int16_t>(bufSize, int16_t(ifmZp)));
+                        break;
+                    default:
+                        assert(false && "Unsupported ifm data type");
+                        break;
+                }
+                ifm0->bufferView = BufferView(ifm0buf, 0, DataTypeStorageSizeBits(ifm0->dataType), ifm0shape, {});
+                ifm0->storageShape = ifm0->bufferView.ViewShape();
+                ifm0->uid = ifm0->equivalenceId = ifm0uid;
+
+                subOpIfm->tensor = std::move(ifm0);
+                subOpIfm->tensor->consumers.push_back(subOp.get());
+                subOpIfm->shape = ifm0shape;
+                subOpIfm->slice.offset = ifm0shape.WithZeros();
+                subOpIfm->slice.shape = ifm0shape;
+
+                // TODO: MLBEDSW-9759 Pooling Decomposition
+                result.emplace_back(std::move(subOp));
+            }
+            else if ( conv2dSubOps.size() > 1 )
+            {
+                auto &tail = conv2dSubOps.back();
+                auto bias = tail->Input(TensorUsage::Scales);
+
+                // Create SchedulerTensor for 0 (no) bias
+                auto bias0 = std::make_shared<SchedulerTensor>(*bias->tensor);
+                auto bias0buf = std::make_shared<Buffer>(std::make_unique<int64_t>(0));
+                assert(DataTypeStorageSizeBits(bias0->dataType) <= int(8 * sizeof(int64_t)));
+                bias0->bufferView = BufferView(bias0buf, 0, DataTypeStorageSizeBits(bias0->dataType), {1}, {});
+                bias0->storageShape = bias0->bufferView.ViewShape();
+                bias0->uid = bias0->equivalenceId = GenerateUniqueId();
+                bias0->consumers.clear();
+
+                for ( auto subOp = conv2dSubOps.begin(); subOp != conv2dSubOps.end(); ++subOp )
+                {
+                    if ( subOp != conv2dSubOps.begin() )
+                    {
+                        // Acc source ifm2 for all but first subop
+                        (*subOp)->AddInput(TensorUsage::IFM1, acc)->shape = acc->storageShape;
+                        (*subOp)->SetAccumulatorMode({AccumulatorSource::Ifm2, true});
+                    }
+                    if ( *subOp != tail )
+                    {
+                        // Remove scaling and bias and set ofm = acc tensor
+                        // (used as acc input for next op) for all but last subop
+                        auto subOpOfm = (*subOp)->OFM();
+                        auto subOpIfm = (*subOp)->IFM(0);
+                        auto subOpWeights = (*subOp)->Input(TensorUsage::Weights);
+                        auto subOpBias = (*subOp)->Input(TensorUsage::Scales);
+                        subOpOfm->tensor = acc;
+                        subOpOfm->tensor->producers.push_back((*subOp).get());
+                        subOpOfm->shape = acc->storageShape;
+                        subOpOfm->slice.offset = subOpOfm->shape.WithZeros();
+                        subOpOfm->quantization.scales = {QuantizedScale::Unit()};
+                        subOpIfm->quantization.scales = {QuantizedScale::Unit()};
+                        subOpWeights->quantization.scales = {QuantizedScale::Unit()};
+                        subOpBias->tensor->RemoveReader((*subOp).get());
+                        subOpBias->tensor = bias0;
+                        subOpBias->tensor->consumers.push_back((*subOp).get());
+                        subOpBias->shape = bias0->storageShape;
+                    }
+                }
+            }
+            auto end = std::make_move_iterator(conv2dSubOps.end());
+            for ( auto subOp = std::make_move_iterator(conv2dSubOps.begin()); subOp != end; ++subOp )
+            {
+                auto subOps = DecomposeConv2D(arch, *subOp);
+                result.insert(result.end(), std::make_move_iterator(subOps.begin()), std::make_move_iterator(subOps.end()));
+            }
+        }
+        return result;
+    }
+    // If we get here, decomposition has failed, the resulting operations will be executed on CPU
+    result.emplace_back(std::move(op));
+    return result;
+}
+
 std::vector<std::unique_ptr<SchedulerOperation>> DecomposeDepthwiseConv2D(Architecture *arch, std::unique_ptr<SchedulerOperation> op)
 {
     std::vector<std::unique_ptr<SchedulerOperation>> result;
diff --git a/ethosu/regor/compiler/scheduler_decompose.hpp b/ethosu/regor/compiler/scheduler_decompose.hpp
index 81c16930..76e4c99c 100644
--- a/ethosu/regor/compiler/scheduler_decompose.hpp
+++ b/ethosu/regor/compiler/scheduler_decompose.hpp
@@ -36,6 +36,7 @@ bool NeedsDecompose(Architecture *arch, const SchedulerOperation *schedOp);
 bool CanRunOnHardware(Architecture *arch, const SchedulerOperation *schedOp);
 bool CanDecompose(Architecture *arch, const SchedulerOperation *schedOp);
 std::vector<std::unique_ptr<SchedulerOperation>> DecomposeConv2D(Architecture *arch, std::unique_ptr<SchedulerOperation> op);
+std::vector<std::unique_ptr<SchedulerOperation>> DecomposeConv3D(Architecture *arch, std::unique_ptr<SchedulerOperation> op);
 std::vector<std::unique_ptr<SchedulerOperation>> DecomposeDepthwiseConv2D(Architecture *arch, std::unique_ptr<SchedulerOperation> op);
 std::vector<std::unique_ptr<SchedulerOperation>> DecomposeTransposeConv2D(Architecture *arch, std::unique_ptr<SchedulerOperation> op);
 std::vector<std::unique_ptr<SchedulerOperation>> DecomposeElementwise(Architecture *arch, std::unique_ptr<SchedulerOperation> op);
diff --git a/ethosu/regor/compiler/scheduler_operation.hpp b/ethosu/regor/compiler/scheduler_operation.hpp
index f1462d07..1be755d5 100644
--- a/ethosu/regor/compiler/scheduler_operation.hpp
+++ b/ethosu/regor/compiler/scheduler_operation.hpp
@@ -71,6 +71,12 @@ public:
         this->uid = GenerateUniqueId();
     }
 
+    void RemoveReader(const SchedulerOperation *op)
+    {
+        auto end = std::remove(consumers.begin(), consumers.end(), op);
+        consumers.erase(end, consumers.end());
+    }
+
     void SetAddress(Address address)
     {
         assert(allocatedAddress == -1 && address >= 0);
@@ -225,6 +231,17 @@ public:
     SchedulerConnection *IFM(int index) { return &inputs.at(MakeTensorUsage(TensorUsage::IFM, index)); }
     const SchedulerConnection *IFM(int index) const { return &inputs.at(MakeTensorUsage(TensorUsage::IFM, index)); }
 
+    // Invalidates all pointers to input connections.
+    void RemoveInput(TensorUsage usage)
+    {
+        auto inputConnection = inputs.try_ref(usage);
+        if ( inputConnection )
+        {
+            if ( inputConnection->tensor ) inputConnection->tensor->RemoveReader(this);
+            inputs.erase(usage);
+        }
+    }
+
     // Output connections
     SchedulerConnection *AddOutput(TensorUsage usage) { return &outputs[usage]; }
diff --git a/ethosu/regor/compiler/scheduler_packing.cpp b/ethosu/regor/compiler/scheduler_packing.cpp
index dfa3c657..53aa06c0 100644
--- a/ethosu/regor/compiler/scheduler_packing.cpp
+++ b/ethosu/regor/compiler/scheduler_packing.cpp
@@ -558,6 +558,9 @@ std::vector<std::unique_ptr<SchedulerOperation>> SchedulerPacking::DecomposeSche
             case OpType::Conv2D:
                 result = DecomposeConv2D(_arch, std::move(op));
                 break;
+            case OpType::Conv3D:
+                result = DecomposeConv3D(_arch, std::move(op));
+                break;
             case OpType::DepthwiseConv2D:
                 result = DecomposeDepthwiseConv2D(_arch, std::move(op));
                 break;
diff --git a/ethosu/regor/test/test_fast_storage_allocator.cpp b/ethosu/regor/test/test_fast_storage_allocator.cpp
index fe1a2d1f..bec7d6f1 100644
--- a/ethosu/regor/test/test_fast_storage_allocator.cpp
+++ b/ethosu/regor/test/test_fast_storage_allocator.cpp
@@ -76,6 +76,7 @@ static std::unique_ptr CreateSchedule(std::unique_ptr &a
     ArchitectureConfigQuery query{};
     query.kernel = op->Kernel();
     query.ifmBits = DataTypeSizeBits(ifm->tensor->dataType);
+    query.ofmBits = DataTypeSizeBits(ofm->tensor->dataType);
     query.ifmShape[0] = ifm->shape;
     query.ofmShape = ofm->shape;
     query.transpose = TransposeType::None;
diff --git a/ethosu/regor/tflite/custom_operator_ethosu.hpp b/ethosu/regor/tflite/custom_operator_ethosu.hpp
index ec4c7eba..f9ddbeb1 100644
--- a/ethosu/regor/tflite/custom_operator_ethosu.hpp
+++ b/ethosu/regor/tflite/custom_operator_ethosu.hpp
@@ -220,14 +220,15 @@ private:
         {
             const auto offset = tensor->AllocatedAddress();
             const auto allocation = tensor->AllocationSizeBytes();
-            const auto size = tensor->srcTensor->View().Buffer()->Size();
+            const auto buffer = tensor->srcTensor ? tensor->srcTensor->View().Buffer() : tensor->bufferView.Buffer();
+            const auto size = buffer->Size();
 
             assert(tensor->memArea.usage % MemUsage::ReadOnly);
             assert((offset >= 0) && (allocation >= 0));  // Has been allocated
             assert((offset + allocation) <= Address(_readOnlyBuffer->Size()));  // Allocation fits in buffer
             assert(size <= allocation);  // Tensor fits in allocation
 
-            std::copy_n(tensor->srcTensor->View().Buffer()->Data(), size, _readOnlyBuffer->Data() + offset);
+            std::copy_n(buffer->Data(), size, _readOnlyBuffer->Data() + offset);
         }
     }
 };
diff --git a/ethosu/regor/tosa/tosa_reader.cpp b/ethosu/regor/tosa/tosa_reader.cpp
index 582bf4db..75afcc69 100644
--- a/ethosu/regor/tosa/tosa_reader.cpp
+++ b/ethosu/regor/tosa/tosa_reader.cpp
@@ -445,9 +445,10 @@ void TosaReader::LoadGraphs(const tosaFb::TosaGraph *model, std::list
                     tosa_assert(input_tensors.size() > 1);
                     const auto &shape = shapes.at(input_tensors[1]);
-                    kernel.sizeYXZ[0] = shape.axisNHWC[1];
-                    kernel.sizeYXZ[1] = shape.axisNHWC[2];
-                    kernel.sizeYXZ[2] = shape.axisNHWC[0];
+                    tosa_assert(shape.count == 5);
+                    kernel.sizeYXZ[0] = shape.axisNHWC[2];
+                    kernel.sizeYXZ[1] = shape.axisNHWC[3];
+                    kernel.sizeYXZ[2] = shape.axisNHWC[1];
                     const auto &attr = TosaAttr::Get(tosa_operator);
                     tosa_assert(attr.pad());
                     tosa_assert(attr.pad()->size() == 6);
-- 
GitLab
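
Editor's note on the decomposition strategy in DecomposeConv3D above: the sketch below is an
illustrative scalar reference, not part of the patch. A CONV3D output depth-plane `od` is the
running sum of 2D convolutions over the kernel-depth taps `kd`, each reading IFM plane
`id = od * stride_d - pad_near + kd * dilation_d`. The patch carries that running sum in the
hardware accumulator (acc source Ifm2 for every sub-op after the first), gives intermediate
sub-ops unit scales and a zero bias, and applies bias/rescaling only on the final sub-op; when
every tap lands in padding, the output collapses to the bias alone, which is why a Rescale op
over a zero-point-filled IFM is emitted instead. All names here (conv3dByDecomposition, Plane,
DepthParams, the conv2d callback) are hypothetical stand-ins, assuming unquantized int64 math.

#include <cstdint>
#include <functional>
#include <vector>

// One IFM/OFM depth plane, flattened HxWxC (hypothetical helper type).
using Plane = std::vector<int64_t>;

struct DepthParams
{
    int strideD = 1, dilationD = 1, padNear = 0;  // depth-axis kernel parameters
    int inDepth = 0, outDepth = 0, kernelDepth = 0;
};

// conv2d(ifmPlane, kd) stands in for one generated Conv2D sub-operation using the
// weight slice for kernel-depth tap kd; bias and rescaling are NOT applied inside it.
std::vector<Plane> conv3dByDecomposition(const std::vector<Plane> &ifm, const DepthParams &p,
    int64_t bias, const std::function<Plane(const Plane &, int)> &conv2d)
{
    std::vector<Plane> ofm(p.outDepth);
    for ( int od = 0; od < p.outDepth; od++ )
    {
        Plane acc;  // accumulator is reset by the first sub-op of each output plane
        for ( int kd = 0; kd < p.kernelDepth; kd++ )
        {
            // IFM plane read by this tap; taps falling in padding contribute nothing
            const int id = od * p.strideD - p.padNear + kd * p.dilationD;
            if ( id < 0 || id >= p.inDepth ) continue;
            Plane contrib = conv2d(ifm[id], kd);  // one Conv2D sub-op
            if ( acc.empty() )
            {
                acc = std::move(contrib);  // first sub-op: acc source = Reset
            }
            else
            {
                // later sub-ops: acc source = Ifm2 (accumulator restored from the acc tensor)
                for ( size_t i = 0; i < acc.size(); i++ ) acc[i] += contrib[i];
            }
        }
        if ( acc.empty() ) acc.assign(1, 0);  // all taps in padding: the Rescale path
        for ( auto &v : acc ) v += bias;      // bias applied only on the final contribution
        ofm[od] = std::move(acc);
    }
    return ofm;
}

Note the correspondence to the patch: the `acc` local plays the role of the ACC SchedulerTensor
(Int64 for Int16 IFMs, Int32 otherwise), and the intermediate sub-ops' zero bias corresponds to
the single-element `bias0` tensor substituted for all but the last Conv2D in the chain.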