From 65869886e17247f56fb6f8e0c1892427e0982196 Mon Sep 17 00:00:00 2001
From: Fredrik Svedberg
Date: Mon, 20 Jan 2025 11:02:57 +0100
Subject: [PATCH] MLBEDSW-9697 Add support for TOSA CONV3D

Added initial support for TOSA CONV3D.

Change-Id: Id4de4a11da26a555f05941f08d8f176309fcefda
Signed-off-by: Fredrik Svedberg
---
 ethosu/regor/architecture/architecture.hpp       |   1 +
 .../ethos_u55_register_cs_generator.cpp          |   1 +
 ethosu/regor/architecture/ethosu85/ethos_u85.cpp |   2 +-
 .../ethos_u85_register_cs_generator.cpp          |  21 +-
 .../high_level_command_stream_generator.cpp      |  10 +-
 ethosu/regor/compiler/scheduler.cpp              |  14 +-
 ethosu/regor/compiler/scheduler.hpp              |   5 +-
 ethosu/regor/compiler/scheduler_decompose.cpp    | 213 ++++++++++++++++--
 ethosu/regor/compiler/scheduler_decompose.hpp    |   1 +
 ethosu/regor/compiler/scheduler_operation.hpp    |  17 ++
 ethosu/regor/compiler/scheduler_packing.cpp      |   3 +
 .../test/test_fast_storage_allocator.cpp         |   1 +
 ethosu/regor/tflite/custom_operator_ethosu.hpp   |   5 +-
 ethosu/regor/tosa/tosa_reader.cpp                |   7 +-
 14 files changed, 265 insertions(+), 36 deletions(-)

diff --git a/ethosu/regor/architecture/architecture.hpp b/ethosu/regor/architecture/architecture.hpp
index 7546e762..cdf3e104 100644
--- a/ethosu/regor/architecture/architecture.hpp
+++ b/ethosu/regor/architecture/architecture.hpp
@@ -202,6 +202,7 @@ struct ArchitectureConfigQuery
     Shape ofmShape;
     Shape ifmShape[2];
     int ifmBits;
+    int ofmBits;
     Kernel *kernel;
     int lutBytes;
     bool scaled;
diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.cpp b/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.cpp
index 11469d1e..332bb96c 100644
--- a/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.cpp
+++ b/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.cpp
@@ -1447,6 +1447,7 @@ void EthosU55RCSGenerator::InsertTransposeCommand(const HLCStripe *stripe, Tempo
     ArchitectureConfigQuery query{};
     query.kernel = &cmd->operation->kernel;
     query.ifmBits = DataTypeSizeBits(ifm.dataType);
+    query.ofmBits = DataTypeSizeBits(ofm.dataType);
     query.ifmShape[0] = inFM.shape;
     query.ofmShape = outFM.shape;
     query.ofmFormat = TensorFormat::NHWC;
diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85.cpp b/ethosu/regor/architecture/ethosu85/ethos_u85.cpp
index 28e23e70..1f3907b7 100644
--- a/ethosu/regor/architecture/ethosu85/ethos_u85.cpp
+++ b/ethosu/regor/architecture/ethosu85/ethos_u85.cpp
@@ -912,7 +912,7 @@ std::unique_ptr ArchEthosU85::FindBlockConfig(OpType opTyp
     // Accumulator settings
     EthosU85Accumulator accType = EthosU85Accumulator::Acc32;
     if ( (query.ifmBits == 16 && !isPooling && query.scaled) ||  // Normal 16-bit selection
-         (query.ifmBits > 32) )  // Special case for Rescale int48
+         (query.ifmBits > 32) || (query.ofmBits > 32) )  // Special case for Rescale int48
     {
         accType = EthosU85Accumulator::Acc48;
     }
diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.cpp b/ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.cpp
index 9412c0b0..c2f352b8 100644
--- a/ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.cpp
+++ b/ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.cpp
@@ -1470,6 +1470,10 @@ void EthosU85RCSGenerator::GenerateAccFormat(const HLCStripe *stripe)
     {
         accType = config->Acc();
         accSrc = config->AccSource();
+        assert(
+            accSrc != ArchAccumulatorSource::Ifm2 ||
+            (stripe->operation->ifm[1].dataType == DataType::Int32 && accType == EthosU85Accumulator::Acc32) ||
+            (stripe->operation->ifm[1].dataType == DataType::Int64 && accType == EthosU85Accumulator::Acc48));
     }
 
     acc_format format = accType == EthosU85Accumulator::Acc32 ? acc_format::I32 : acc_format::I48;
@@ -1694,8 +1698,9 @@ void EthosU85RCSGenerator::GenerateOperationCode(const HLCOperation *op)
     }
     else if ( IsConvolution(opType) || IsVectorProduct(opType) )
     {
-        // Dynamic weights when op->ifm.size() == 2, _weights_ifm2 parameter should be True
-        Emit(isa::npu_op_conv_t(op->ifm.size() == 2));
+        // Dynamic weights when op->ifm.size() == 2 and acc source != ifm2, _weights_ifm2 parameter should be True
+        auto accSource = static_cast<EthosU85OpConfig *>(op->config)->AccSource();
+        Emit(isa::npu_op_conv_t(op->ifm.size() == 2 && accSource != ArchAccumulatorSource::Ifm2));
     }
     else if ( IsElementwise(opType) )
     {
@@ -1777,18 +1782,22 @@ void EthosU85RCSGenerator::GenerateCommon(const HLCStripe *stripe, bool useGloba
 void EthosU85RCSGenerator::GenerateConvolutionOp(const HLCStripe *stripe, MemoryAccesses &memoryAccesses)
 {
     auto op = stripe->operation.get();
+    EthosU85OpConfig *config = static_cast<EthosU85OpConfig *>(op->config);
     QuantizedScale ofmScale(1, 0);
     bool useGlobalScale = false;
 
     ethosU85Scaling::RescaleConvolution(op);
 
     if ( op->ifm.size() == 2 )
     {
-        // Dynamic weights
-        assert(ToActivationPrecision(op->ifm[0].dataType) == ToActivationPrecision(op->ifm[1].dataType));
-        useGlobalScale = true;
         GenerateIFM2Precision(op->ifm[1], false, false);
         GenerateIFM2(op->type, op->ifm[1], stripe->ifmAreas[1], false, 0, -1);
-        Emit(isa::npu_set_weight_format_t(weight_format::SWD, weight_sparsity::NONE));  // Reset weight format
+        if ( config->AccSource() != ArchAccumulatorSource::Ifm2 )
+        {
+            // Dynamic weights
+            assert(ToActivationPrecision(op->ifm[0].dataType) == ToActivationPrecision(op->ifm[1].dataType));
+            useGlobalScale = true;
+            Emit(isa::npu_set_weight_format_t(weight_format::SWD, weight_sparsity::NONE));  // Reset weight format
+        }
     }
 
     if ( !op->ofm.quantization.scales.empty() )
diff --git a/ethosu/regor/compiler/high_level_command_stream_generator.cpp b/ethosu/regor/compiler/high_level_command_stream_generator.cpp
index 9b9b5863..c73a0891 100644
--- a/ethosu/regor/compiler/high_level_command_stream_generator.cpp
+++ b/ethosu/regor/compiler/high_level_command_stream_generator.cpp
@@ -60,7 +60,7 @@ enum class TransformLimit
 static Box TransformWithStridesAndSkirt(const Box &outputArea, const Shape *strides, const Point2i &inputStep,
     const HLCPadding *skirt, const Shape &ifmShape, OpType opType, const Shape &concatOffsets, const Shape &splitOffset,
     const Shape &splitShape, int dilatedKernelHeight, int upscalingFactor, int &padTop, int &padBottom,
-    TransformLimit limit = TransformLimit::None, TransposeType transposeType = TransposeType::None)
+    TransformLimit limit = TransformLimit::None, TransposeType transposeType = TransposeType::None, bool accIfm = false)
 {
     Shape outputAreaStart = outputArea.Start().Unpermute(uint32_t(transposeType));
     Shape outputAreaEnd = outputArea.End().Unpermute(uint32_t(transposeType));
@@ -109,6 +109,9 @@
         start = splitOffset;
         end = start + splitShape;
     }
+
+    if ( accIfm ) return Box(start, end);
+
     end = Shape::Min(end, Shape::Max(ifmShape, Shape(1, 1, 1, 1)).WithHW(ifmShape.Height() * upscalingFactor, ifmShape.Width() * upscalingFactor));
     padTop = 0;
     padBottom = 0;
@@ -170,7 +173,7 @@
 }
 
 static std::pair<Box, HLCPadding> TransformWithInputOutputSteps(const Box &inputArea, const Point2i &inputStep,
-    const Box &outputArea, const Point2i &outputStep, class Kernel *kernel, const HLCPadding &padding, const Shape &ifmShape)
+    const Box &outputArea, const Point2i &outputStep, const Kernel *kernel, const HLCPadding &padding, const Shape &ifmShape)
 {
     const auto &stride = kernel->Stride();
     const auto dilatedWH = kernel->DilatedWH();
@@ -663,10 +666,11 @@ void HLCStreamGenerator::GenerateHLCStripeCommands(SchedulerOperation *op, const
     {
         if ( !IsIFM(fm.usage) ) continue;
         auto ifmConn = op->Input(fm.usage);
+        bool accIfm = op->AccumulatorMode().source == AccumulatorSource::Ifm2 && fm.usage == TensorUsage::IFM1;
         // Calculate input area based on the output area
         auto inputArea = TransformWithStridesAndSkirt(outputArea, &strides, ifmConn->stepXY, &skirt, ifmConn->shape,
             opType, ofmConn->slice.offset, ifmConn->slice.offset, ifmConn->slice.shape, dilatedKernelHeight,
-            upscaling, hlcStripe->padding.top, hlcStripe->padding.bottom, ifmLimit, ofmConn->transpose);
+            upscaling, hlcStripe->padding.top, hlcStripe->padding.bottom, ifmLimit, ofmConn->transpose, accIfm);
         if ( ofmConn->stepXY != Point2i{1, 1} || ifmConn->stepXY != Point2i{1, 1} )
         {
             std::tie(inputArea, hlcStripe->padding) = TransformWithInputOutputSteps(inputArea,
diff --git a/ethosu/regor/compiler/scheduler.cpp b/ethosu/regor/compiler/scheduler.cpp
index 9a5ea29a..40241bb5 100644
--- a/ethosu/regor/compiler/scheduler.cpp
+++ b/ethosu/regor/compiler/scheduler.cpp
@@ -391,6 +391,7 @@ std::unique_ptr GetOpConfig(Architecture *arch, SchedulerO
     query.ifmShape[0] = ifmShape;
     query.ifmShape[1] = ifm2Shape;
     query.ifmBits = DataTypeSizeBits(ifm->tensor->dataType);
+    query.ofmBits = DataTypeSizeBits(ofm->tensor->dataType);
     query.kernel = op->Kernel();
     query.lutBytes = op->TryInput(TensorUsage::LUT) ? 2048 : 0;
     query.scaled = op->HasScaling();
@@ -680,7 +681,8 @@ std::unique_ptr Scheduler::CreateSchedulerOpInfo(
     {
         blockConfig = parentInfo ? parentInfo->Config()->Clone() : GetOpConfig(_arch, op, ifmShape, ifm2Shape, ofmShape, weightFormat);
     }
-    if ( !weights && op->OFM()->quantization.scales.size() > 1 )
+    auto scales = op->TryInput(TensorUsage::Scales);
+    if ( !weights && (op->OFM()->quantization.scales.size() > 1 || scales) )
     {
         WeightsRef weightsRef;
         weightsRef.isScales = true;
@@ -690,7 +692,8 @@
         auto encodingParams = _arch->WeightEncoder()->GetEncodingConfig(
             blockConfig.get(), weightsRef, op->Kernel(), ifm->tensor->dataType, depthOffsets, weightFormat);
-        weightScales = EncodeQuantizationScaleTensor(std::move(encodingParams), op->OFM()->quantization);
+        const SchedulerTensor *scaleTensor = scales ? scales->tensor.get() : nullptr;
+        weightScales = EncodeQuantizationScaleTensor(std::move(encodingParams), op->OFM()->quantization, scaleTensor);
     }
     // Finally construct and populate operator information (cost)
     auto opInfo = std::make_unique<SchedulerOpInfo>(std::move(blockConfig), ifmShape, ifm2Shape, ofmShape);
@@ -1852,12 +1855,13 @@ static int ApplyZeroPointOHWI(const WeightTransformParam *param, int value)
     return value;
 }
 
-WeightScaleTensors Scheduler::EncodeQuantizationScaleTensor(std::unique_ptr encodingParams, Quantization &ofmQuantization)
+WeightScaleTensors Scheduler::EncodeQuantizationScaleTensor(std::unique_ptr encodingParams,
+    const Quantization &ofmQuantization, const SchedulerTensor *scales)
 {
     SchedulerTensor scaleTens;
     scaleTens.dataType = DataType::Int32;
-
-    return TryEncodeWeightAndScaleTensor(encodingParams.get(), nullptr, &scaleTens, {}, ofmQuantization, false, true);
+    if ( scales == nullptr ) scales = &scaleTens;
+    return TryEncodeWeightAndScaleTensor(encodingParams.get(), nullptr, scales, {}, ofmQuantization, false, true);
 }
 
 WeightScaleTensors Scheduler::EncodeWeightAndScaleTensor(std::unique_ptr encodingParams, const SchedulerTensor *weightTens,
diff --git a/ethosu/regor/compiler/scheduler.hpp b/ethosu/regor/compiler/scheduler.hpp
index 1aff6c59..2abc634c 100644
--- a/ethosu/regor/compiler/scheduler.hpp
+++ b/ethosu/regor/compiler/scheduler.hpp
@@ -1,5 +1,5 @@
 //
-// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates
+// SPDX-FileCopyrightText: Copyright 2021-2025 Arm Limited and/or its affiliates
 //
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -351,7 +351,8 @@ private:
 
     void PrintSchedule(Schedule *schedule);
 
-    WeightScaleTensors EncodeQuantizationScaleTensor(std::unique_ptr encodingParams, Quantization &ofmQuantization);
+    WeightScaleTensors EncodeQuantizationScaleTensor(std::unique_ptr encodingParams,
+        const Quantization &ofmQuantization, const SchedulerTensor *scales = nullptr);
 
     WeightScaleTensors EncodeWeightAndScaleTensor(std::unique_ptr encodingParams, const SchedulerTensor *weightTens,
         const SchedulerTensor *scaleTens, const Quantization &weightQuantization, const Quantization &ofmQuantization);
diff --git a/ethosu/regor/compiler/scheduler_decompose.cpp b/ethosu/regor/compiler/scheduler_decompose.cpp
index f5292271..947f7821 100644
--- a/ethosu/regor/compiler/scheduler_decompose.cpp
+++ b/ethosu/regor/compiler/scheduler_decompose.cpp
@@ -118,11 +118,12 @@ static std::unique_ptr<SchedulerOperation> MakeTransposeOp(
     return op;
 }
 
-static std::unique_ptr<SchedulerOperation> MakeSubOperation(const SchedulerOperation *schedOp, const Kernel *newKernel = nullptr)
+static std::unique_ptr<SchedulerOperation>
+MakeSubOperation(const SchedulerOperation *schedOp, const Kernel *newKernel = nullptr, OpType type = OpType::None)
 {
     assert(schedOp->SubOps().empty());
     assert(schedOp->Parent() == nullptr);
-    auto subOp = std::make_unique<SchedulerOperation>(schedOp->Type());
+    auto subOp = std::make_unique<SchedulerOperation>(type != OpType::None ? type : schedOp->Type());
     subOp->SetKernel(newKernel ? newKernel : schedOp->Kernel());
     subOp->SetHasScaling(schedOp->HasScaling());
     subOp->_srcKey = schedOp->_srcKey;
@@ -179,6 +180,7 @@ static std::unique_ptr GetOpConfig(Architecture *arch, con
         qConfig.ifmShape[1] = ifm1->SliceShape();
     }
     qConfig.ifmBits = DataTypeSizeBits(ifm->tensor->dataType);
+    qConfig.ofmBits = DataTypeSizeBits(ofm->tensor->dataType);
     qConfig.kernel = schedOp->Kernel();
     qConfig.lutBytes = schedOp->TryInput(TensorUsage::LUT) ? 2048 : 0;
     qConfig.scaled = schedOp->HasScaling();
@@ -264,6 +266,7 @@ bool CanRunOnHardware(Architecture *arch, const SchedulerOperation *schedOp)
 
 bool CanDecompose(Architecture *, const SchedulerOperation *schedOp)
 {
     if ( schedOp->Type() == OpType::Conv2D ) return true;
+    if ( schedOp->Type() == OpType::Conv3D ) return true;
     if ( schedOp->Type() == OpType::DepthwiseConv2D ) return true;
     if ( schedOp->Type() == OpType::TransposeConv2D ) return true;
     if ( DecomposeAsElementwise(schedOp->Type()) || schedOp->Type() == OpType::MemoryCopy ) return true;
@@ -487,6 +490,9 @@ template<typename TYPE>
 static std::shared_ptr<SchedulerTensor>
 SliceT(SchedulerTensor *tensor, const Shape &offset, const Shape &shape, const Shape &readShape, const Point2i &stepXY)
 {
+    constexpr int MAX_RANK = 5;
+    assert(shape.Size() <= MAX_RANK);
+    assert(offset.Size() <= MAX_RANK);
     auto paddedInShape = Shape::PadAxes(readShape ? readShape : tensor->bufferView.ViewShape(), shape.Size(), 1);
     const auto &inBufferView = tensor->bufferView.Reshape(paddedInShape).SubView(offset, shape);
     const auto &inBufferValues = inBufferView.Values<TYPE>();
@@ -498,21 +504,21 @@
     auto outBufferValues = outBufferView.WritableValues<TYPE>();
 
     // Copy values into the output buffer
-    auto paddedOutShape = Shape::PadAxes(shape, 4, 1);
-    int batch = paddedOutShape.Batch();
-    int height = paddedOutShape.Height();
-    int width = paddedOutShape.Width();
-    int depth = paddedOutShape.Depth();
-    for ( int n = 0; n < batch; n++ )
+    auto paddedOutShape = Shape::PadAxes(shape, MAX_RANK, 1);
+    auto ndhwc = paddedOutShape.WithZeros();
+    for ( ndhwc[0] = 0; ndhwc[0] < paddedOutShape[0]; ndhwc[0]++ )
     {
-        for ( int h = 0; h < height; h += stepXY.y )
+        for ( ndhwc[1] = 0; ndhwc[1] < paddedOutShape[1]; ndhwc[1]++ )
         {
-            for ( int w = 0; w < width; w += stepXY.x )
+            for ( ndhwc[2] = 0; ndhwc[2] < paddedOutShape[2]; ndhwc[2] += stepXY.y )
             {
-                for ( int c = 0; c < depth; c++ )
+                for ( ndhwc[3] = 0; ndhwc[3] < paddedOutShape[3]; ndhwc[3] += stepXY.x )
                 {
-                    Shape pos({n, h, w, c}, shape.Size());
-                    outBufferValues[pos] = inBufferValues[pos];
+                    for ( ndhwc[4] = 0; ndhwc[4] < paddedOutShape[4]; ndhwc[4]++ )
+                    {
+                        Shape pos(ndhwc, shape.Size());
+                        outBufferValues[pos] = inBufferValues[pos];
+                    }
                 }
             }
         }
@@ -748,6 +754,7 @@ DecomposeForStrides(Architecture *arch, std::unique_ptr op,
     auto weightStepXY = Point2i{SX, SY};
     auto newKernel = kernel->WithStride({1, 1}).WithSize({newWidth, newHeight});
     std::unique_ptr<SchedulerOperation> subOp = MakeSubOperation(op.get(), &newKernel);
+    subOp->RemoveInput(TensorUsage::IFM1);  // Remove acc input
     auto *subIfmConn = subOp->Input(TensorUsage::IFM);
     subIfmConn->slice = std::move(newIfmSlice);
     subIfmConn->stepXY = ifmStrides;
@@ -768,8 +775,15 @@
     accMode.outputEnabled = true;
     result.back()->SetAccumulatorMode(accMode);
     accMode = result.front()->AccumulatorMode();
-    accMode.source = AccumulatorSource::Reset;
+    accMode.source = op->AccumulatorMode().source;
     result.front()->SetAccumulatorMode(accMode);
+    // Reconnect acc input
+    if ( accMode.source == AccumulatorSource::Ifm2 )
+    {
+        auto subOpIfm2 = result.front()->AddInput(TensorUsage::IFM1);
+        *subOpIfm2 = *op->Input(TensorUsage::IFM1);
+        subOpIfm2->tensor->consumers.push_back(result.front().get());
+    }
     return result;
 }
@@ -827,6 +841,177 @@ std::vector<std::unique_ptr<SchedulerOperation>> DecomposeConv2D(Architecture *a
     return result;
 }
 
+std::vector<std::unique_ptr<SchedulerOperation>> DecomposeConv3D(Architecture *arch, std::unique_ptr<SchedulerOperation> op)
+{
+    std::vector<std::unique_ptr<SchedulerOperation>> result;
+    auto *ofmConn = op->Output(TensorUsage::OFM);
+    auto *ifmConn = op->Input(TensorUsage::IFM);
+    auto *weightsConn = op->Input(TensorUsage::Weights);
+    const auto &ofmShape = ofmConn->SliceShape();
+    const auto &ifmShape = ifmConn->SliceShape();
+    auto &ofmSlice = ofmConn->slice;
+    auto &ifmSlice = ifmConn->slice;
+    auto *kernel = op->Kernel();
+    auto &padding = kernel->Padding();
+    ofmSlice.Initialize(ofmShape.WithZeros(), ofmShape);
+    ifmSlice.Initialize(ifmShape.WithZeros(), ifmShape);
+
+    if ( ofmShape[0] > 1 )  // Batch
+    {
+        return DecomposeLeadingDimensions(1, arch, std::move(op), DecomposeConv3D);
+    }
+    const int OD = ofmSlice.shape[1];
+    const int ID = ifmSlice.shape[1];
+    const int KD = kernel->Size3D().z;
+    if ( (arch->Constraints()->SupportsAccumulatorSaveRestore() || KD == 1) && weightsConn->tensor->IsConstant() )
+    {
+        auto InitConnection = [](SchedulerConnection *dst, SchedulerConnection *src, int dOffset, int dSize)
+        {
+            dst->shape = Shape(src->SliceShape(), 4);
+            // Handle batch
+            dst->shape[0] *= src->shape[0];
+            dst->slice.offset = dst->shape.WithZeros().WithBatch(src->slice.offset[0] * dSize + dOffset);
+            dst->slice.shape = dst->shape.WithBatch(1);
+        };
+        // Create SchedulerTensor for ACC
+        auto acc = std::make_shared<SchedulerTensor>();
+        acc->memArea = ofmConn->tensor->memArea;
+        acc->dataType = ifmConn->tensor->dataType == DataType::Int16 ? DataType::Int64 : DataType::Int32;
+        acc->storageShape = Shape(ofmShape, 4).WithBatch(1);
+        acc->uid = acc->equivalenceId = GenerateUniqueId();
+        const auto ifm0uid = GenerateUniqueId();
+        for ( int od = 0; od < OD; od++ )
+        {
+            std::vector<std::unique_ptr<SchedulerOperation>> conv2dSubOps;
+            for ( int kd = 0; kd < KD; kd++ )
+            {
+                const int id = od * kernel->Stride3D().z - padding.Near() + kd * kernel->Dilation3D().z;
+                if ( id >= 0 && id < ID )
+                {
+                    auto subOp = MakeSubOperation(op.get(), nullptr, OpType::Conv2D);
+                    InitConnection(subOp->Output(TensorUsage::OFM), ofmConn, od, OD);
+                    InitConnection(subOp->Input(TensorUsage::IFM), ifmConn, id, ID);
+                    // Update slice offset for DecomposeConv2D pad handling
+                    auto subOpIfm = subOp->Input(TensorUsage::IFM);
+                    subOpIfm->slice.offset = subOpIfm->slice.offset.WithHW(-padding.Top(), -padding.Left());
+
+                    auto subOpWeights = subOp->Input(TensorUsage::Weights);
+                    if ( KD > 1 )
+                    {
+                        auto offset = subOpWeights->shape.WithZeros().With(1, kd);
+                        subOpWeights->tensor = Slice(subOpWeights->tensor.get(), offset, subOpWeights->shape.With(1, 1));
+                        subOpWeights->tensor->consumers.push_back(subOp.get());
+                    }
+                    // New weight shape
+                    auto subOpWeightShape = subOpWeights->shape.Erase(1);
+                    subOpWeights->shape = subOpWeights->tensor->storageShape = subOpWeightShape;
+                    subOpWeights->tensor->bufferView = subOpWeights->tensor->bufferView.Reshape(subOpWeightShape);
+
+                    conv2dSubOps.emplace_back(std::move(subOp));
+                }
+            }
+            if ( conv2dSubOps.empty() )
+            {
+                // Kernel in padding only area, need to broadcast bias to OFM,
+                // using Rescale with bias and ifm zero point as input
+                auto unitKernel = Kernel::UnitKernel();
+                auto subOp = MakeSubOperation(op.get(), &unitKernel, OpType::Rescale);
+                subOp->RemoveInput(TensorUsage::Weights);
+                InitConnection(subOp->Output(TensorUsage::OFM), ofmConn, od, OD);
+
+                // Create SchedulerTensor for 0 input
+                auto subOpIfm = subOp->Input(TensorUsage::IFM);
+                auto ifm0 = std::make_shared<SchedulerTensor>();
+                ifm0->dataType = subOpIfm->tensor->dataType;
+                ifm0->memArea = subOp->Input(TensorUsage::Scales)->tensor->memArea;
+                ifm0->format = TensorFormat::NHWC;
+                const auto &ifm0shape = subOp->Output(TensorUsage::OFM)->slice.shape;
+                const auto bufSize = ifm0shape.Elements();
+                const int64_t ifmZp = subOpIfm->quantization.zeroPoints.empty() ? 0 : subOpIfm->quantization.zeroPoints.front();
+                std::shared_ptr<Buffer> ifm0buf;
+                switch ( ifm0->dataType )
+                {
+                    case DataType::Int8:
+                        ifm0buf = std::make_shared<Buffer>(std::vector<int8_t>(bufSize, int8_t(ifmZp)));
+                        break;
+                    case DataType::Int16:
+                        ifm0buf = std::make_shared<Buffer>(std::vector<int16_t>(bufSize, int16_t(ifmZp)));
+                        break;
+                    default:
+                        assert(false && "Unsupported ifm data type");
+                        break;
+                }
+                ifm0->bufferView = BufferView(ifm0buf, 0, DataTypeStorageSizeBits(ifm0->dataType), ifm0shape, {});
+                ifm0->storageShape = ifm0->bufferView.ViewShape();
+                ifm0->uid = ifm0->equivalenceId = ifm0uid;
+
+                subOpIfm->tensor = std::move(ifm0);
+                subOpIfm->tensor->consumers.push_back(subOp.get());
+                subOpIfm->shape = ifm0shape;
+                subOpIfm->slice.offset = ifm0shape.WithZeros();
+                subOpIfm->slice.shape = ifm0shape;
+
+                // TODO: MLBEDSW-9759 Pooling Decomposition
+                result.emplace_back(std::move(subOp));
+            }
+            else if ( conv2dSubOps.size() > 1 )
+            {
+                auto &tail = conv2dSubOps.back();
+                auto bias = tail->Input(TensorUsage::Scales);
+
+                // Create SchedulerTensor for 0 (no) bias
+                auto bias0 = std::make_shared<SchedulerTensor>(*bias->tensor);
+                auto bias0buf = std::make_shared<Buffer>(std::make_unique<int64_t>(0));
+                assert(DataTypeStorageSizeBits(bias0->dataType) <= int(8 * sizeof(int64_t)));
+                bias0->bufferView = BufferView(bias0buf, 0, DataTypeStorageSizeBits(bias0->dataType), {1}, {});
+                bias0->storageShape = bias0->bufferView.ViewShape();
+                bias0->uid = bias0->equivalenceId = GenerateUniqueId();
+                bias0->consumers.clear();
+
+                for ( auto subOp = conv2dSubOps.begin(); subOp != conv2dSubOps.end(); ++subOp )
+                {
+                    if ( subOp != conv2dSubOps.begin() )
+                    {
+                        // Acc source ifm2 for all but first subop
+                        (*subOp)->AddInput(TensorUsage::IFM1, acc)->shape = acc->storageShape;
+                        (*subOp)->SetAccumulatorMode({AccumulatorSource::Ifm2, true});
+                    }
+                    if ( *subOp != tail )
+                    {
+                        // Remove scaling and bias and set ofm = acc tensor
+                        // (used as acc input for next op) for all but last subop
+                        auto subOpOfm = (*subOp)->OFM();
+                        auto subOpIfm = (*subOp)->IFM(0);
+                        auto subOpWeights = (*subOp)->Input(TensorUsage::Weights);
+                        auto subOpBias = (*subOp)->Input(TensorUsage::Scales);
+                        subOpOfm->tensor = acc;
+                        subOpOfm->tensor->producers.push_back((*subOp).get());
+                        subOpOfm->shape = acc->storageShape;
+                        subOpOfm->slice.offset = subOpOfm->shape.WithZeros();
+                        subOpOfm->quantization.scales = {QuantizedScale::Unit()};
+                        subOpIfm->quantization.scales = {QuantizedScale::Unit()};
+                        subOpWeights->quantization.scales = {QuantizedScale::Unit()};
+                        subOpBias->tensor->RemoveReader((*subOp).get());
+                        subOpBias->tensor = bias0;
+                        subOpBias->tensor->consumers.push_back((*subOp).get());
+                        subOpBias->shape = bias0->storageShape;
+                    }
+                }
+            }
+            auto end = std::make_move_iterator(conv2dSubOps.end());
+            for ( auto subOp = std::make_move_iterator(conv2dSubOps.begin()); subOp != end; ++subOp )
+            {
+                auto subOps = DecomposeConv2D(arch, *subOp);
+                result.insert(result.end(), std::make_move_iterator(subOps.begin()), std::make_move_iterator(subOps.end()));
+            }
+        }
+        return result;
+    }
+    // If we get here, decomposition has failed, the resulting operations will be executed on CPU
+    result.emplace_back(std::move(op));
+    return result;
+}
+
 std::vector<std::unique_ptr<SchedulerOperation>> DecomposeDepthwiseConv2D(Architecture *arch, std::unique_ptr<SchedulerOperation> op)
 {
     std::vector<std::unique_ptr<SchedulerOperation>> result;
diff --git a/ethosu/regor/compiler/scheduler_decompose.hpp b/ethosu/regor/compiler/scheduler_decompose.hpp
index 81c16930..76e4c99c 100644
--- a/ethosu/regor/compiler/scheduler_decompose.hpp
+++ b/ethosu/regor/compiler/scheduler_decompose.hpp
@@ -36,6 +36,7 @@ bool NeedsDecompose(Architecture *arch, const SchedulerOperation *schedOp);
 bool CanRunOnHardware(Architecture *arch, const SchedulerOperation *schedOp);
 bool CanDecompose(Architecture *arch, const SchedulerOperation *schedOp);
 std::vector<std::unique_ptr<SchedulerOperation>> DecomposeConv2D(Architecture *arch, std::unique_ptr<SchedulerOperation> op);
+std::vector<std::unique_ptr<SchedulerOperation>> DecomposeConv3D(Architecture *arch, std::unique_ptr<SchedulerOperation> op);
 std::vector<std::unique_ptr<SchedulerOperation>> DecomposeDepthwiseConv2D(Architecture *arch, std::unique_ptr<SchedulerOperation> op);
 std::vector<std::unique_ptr<SchedulerOperation>> DecomposeTransposeConv2D(Architecture *arch, std::unique_ptr<SchedulerOperation> op);
 std::vector<std::unique_ptr<SchedulerOperation>> DecomposeElementwise(Architecture *arch, std::unique_ptr<SchedulerOperation> op);
diff --git a/ethosu/regor/compiler/scheduler_operation.hpp b/ethosu/regor/compiler/scheduler_operation.hpp
index f1462d07..1be755d5 100644
--- a/ethosu/regor/compiler/scheduler_operation.hpp
+++ b/ethosu/regor/compiler/scheduler_operation.hpp
@@ -71,6 +71,12 @@ public:
         this->uid = GenerateUniqueId();
     }
 
+    void RemoveReader(const SchedulerOperation *op)
+    {
+        auto end = std::remove(consumers.begin(), consumers.end(), op);
+        consumers.erase(end, consumers.end());
+    }
+
     void SetAddress(Address address)
     {
         assert(allocatedAddress == -1 && address >= 0);
@@ -225,6 +231,17 @@ public:
     SchedulerConnection *IFM(int index) { return &inputs.at(MakeTensorUsage(TensorUsage::IFM, index)); }
     const SchedulerConnection *IFM(int index) const { return &inputs.at(MakeTensorUsage(TensorUsage::IFM, index)); }
 
+    // Invalidates all pointers to input connections.
+    void RemoveInput(TensorUsage usage)
+    {
+        auto inputConnection = inputs.try_ref(usage);
+        if ( inputConnection )
+        {
+            if ( inputConnection->tensor ) inputConnection->tensor->RemoveReader(this);
+            inputs.erase(usage);
+        }
+    }
+
     // Output connections
     SchedulerConnection *AddOutput(TensorUsage usage) { return &outputs[usage]; }
diff --git a/ethosu/regor/compiler/scheduler_packing.cpp b/ethosu/regor/compiler/scheduler_packing.cpp
index dfa3c657..53aa06c0 100644
--- a/ethosu/regor/compiler/scheduler_packing.cpp
+++ b/ethosu/regor/compiler/scheduler_packing.cpp
@@ -558,6 +558,9 @@ std::vector<std::unique_ptr<SchedulerOperation>> SchedulerPacking::DecomposeSche
             case OpType::Conv2D:
                 result = DecomposeConv2D(_arch, std::move(op));
                 break;
+            case OpType::Conv3D:
+                result = DecomposeConv3D(_arch, std::move(op));
+                break;
             case OpType::DepthwiseConv2D:
                 result = DecomposeDepthwiseConv2D(_arch, std::move(op));
                 break;
diff --git a/ethosu/regor/test/test_fast_storage_allocator.cpp b/ethosu/regor/test/test_fast_storage_allocator.cpp
index fe1a2d1f..bec7d6f1 100644
--- a/ethosu/regor/test/test_fast_storage_allocator.cpp
+++ b/ethosu/regor/test/test_fast_storage_allocator.cpp
@@ -76,6 +76,7 @@ static std::unique_ptr CreateSchedule(std::unique_ptr &a
     ArchitectureConfigQuery query{};
     query.kernel = op->Kernel();
     query.ifmBits = DataTypeSizeBits(ifm->tensor->dataType);
+    query.ofmBits = DataTypeSizeBits(ofm->tensor->dataType);
     query.ifmShape[0] = ifm->shape;
     query.ofmShape = ofm->shape;
     query.transpose = TransposeType::None;
diff --git a/ethosu/regor/tflite/custom_operator_ethosu.hpp b/ethosu/regor/tflite/custom_operator_ethosu.hpp
index ec4c7eba..f9ddbeb1 100644
--- a/ethosu/regor/tflite/custom_operator_ethosu.hpp
+++ b/ethosu/regor/tflite/custom_operator_ethosu.hpp
@@ -220,14 +220,15 @@ private:
         {
             const auto offset = tensor->AllocatedAddress();
             const auto allocation = tensor->AllocationSizeBytes();
-            const auto size = tensor->srcTensor->View().Buffer()->Size();
+            const auto buffer = tensor->srcTensor ? tensor->srcTensor->View().Buffer() : tensor->bufferView.Buffer();
+            const auto size = buffer->Size();
 
             assert(tensor->memArea.usage % MemUsage::ReadOnly);
             assert((offset >= 0) && (allocation >= 0));  // Has been allocated
             assert((offset + allocation) <= Address(_readOnlyBuffer->Size()));  // Allocation fits in buffer
             assert(size <= allocation);  // Tensor fits in allocation
 
-            std::copy_n(tensor->srcTensor->View().Buffer()->Data(), size, _readOnlyBuffer->Data() + offset);
+            std::copy_n(buffer->Data(), size, _readOnlyBuffer->Data() + offset);
         }
     }
 };
diff --git a/ethosu/regor/tosa/tosa_reader.cpp b/ethosu/regor/tosa/tosa_reader.cpp
index 582bf4db..75afcc69 100644
--- a/ethosu/regor/tosa/tosa_reader.cpp
+++ b/ethosu/regor/tosa/tosa_reader.cpp
@@ -445,9 +445,10 @@ void TosaReader::LoadGraphs(const tosaFb::TosaGraph *model, std::list
                     tosa_assert(input_tensors.size() > 1);
                     const auto &shape = shapes.at(input_tensors[1]);
-                    kernel.sizeYXZ[0] = shape.axisNHWC[1];
-                    kernel.sizeYXZ[1] = shape.axisNHWC[2];
-                    kernel.sizeYXZ[2] = shape.axisNHWC[0];
+                    tosa_assert(shape.count == 5);
+                    kernel.sizeYXZ[0] = shape.axisNHWC[2];
+                    kernel.sizeYXZ[1] = shape.axisNHWC[3];
+                    kernel.sizeYXZ[2] = shape.axisNHWC[1];
                     const auto &attr = TosaAttr::Get(tosa_operator);
                     tosa_assert(attr.pad());
                     tosa_assert(attr.pad()->size() == 6);
-- 
GitLab
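
Editor's note on the decomposition strategy in DecomposeConv3D above: the sketch below is an
illustrative scalar reference, not part of the patch. A CONV3D output depth-plane `od` is the
running sum of 2D convolutions over the kernel-depth taps `kd`, each reading IFM plane
`id = od * stride_d - pad_near + kd * dilation_d`. The patch carries that running sum in the
hardware accumulator (acc source Ifm2 for every sub-op after the first), gives intermediate
sub-ops unit scales and a zero bias, and applies bias/rescaling only on the final sub-op; when
every tap lands in padding, the output collapses to the bias alone, which is why a Rescale op
over a zero-point-filled IFM is emitted instead. All names here (conv3dByDecomposition, Plane,
DepthParams, the conv2d callback) are hypothetical stand-ins, assuming unquantized int64 math.

#include <cstdint>
#include <functional>
#include <vector>

// One IFM/OFM depth plane, flattened HxWxC (hypothetical helper type).
using Plane = std::vector<int64_t>;

struct DepthParams
{
    int strideD = 1, dilationD = 1, padNear = 0;  // depth-axis kernel parameters
    int inDepth = 0, outDepth = 0, kernelDepth = 0;
};

// conv2d(ifmPlane, kd) stands in for one generated Conv2D sub-operation using the
// weight slice for kernel-depth tap kd; bias and rescaling are NOT applied inside it.
std::vector<Plane> conv3dByDecomposition(const std::vector<Plane> &ifm, const DepthParams &p,
    int64_t bias, const std::function<Plane(const Plane &, int)> &conv2d)
{
    std::vector<Plane> ofm(p.outDepth);
    for ( int od = 0; od < p.outDepth; od++ )
    {
        Plane acc;  // accumulator is reset by the first sub-op of each output plane
        for ( int kd = 0; kd < p.kernelDepth; kd++ )
        {
            // IFM plane read by this tap; taps falling in padding contribute nothing
            const int id = od * p.strideD - p.padNear + kd * p.dilationD;
            if ( id < 0 || id >= p.inDepth ) continue;
            Plane contrib = conv2d(ifm[id], kd);  // one Conv2D sub-op
            if ( acc.empty() )
            {
                acc = std::move(contrib);  // first sub-op: acc source = Reset
            }
            else
            {
                // later sub-ops: acc source = Ifm2 (accumulator restored from the acc tensor)
                for ( size_t i = 0; i < acc.size(); i++ ) acc[i] += contrib[i];
            }
        }
        if ( acc.empty() ) acc.assign(1, 0);  // all taps in padding: the Rescale path
        for ( auto &v : acc ) v += bias;      // bias applied only on the final contribution
        ofm[od] = std::move(acc);
    }
    return ofm;
}

Note the correspondence to the patch: the `acc` local plays the role of the ACC SchedulerTensor
(Int64 for Int16 IFMs, Int32 otherwise), and the intermediate sub-ops' zero bias corresponds to
the single-element `bias0` tensor substituted for all but the last Conv2D in the chain.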