From c52f14907d5e8283a062c4b60ab8c008244d43bc Mon Sep 17 00:00:00 2001
From: Jacob Bohlin
Date: Wed, 15 Jan 2025 18:38:10 +0000
Subject: [PATCH] MLBEDSW-8588 Add support for ConvGroups in Regor

Change-Id: Idace12b6fe663722b8c50cc8e3c475feca044ebd
Signed-off-by: Jacob Bohlin
---
 .../regor/compiler/tflite_graph_optimiser.cpp | 146 ++++++++++++++++++
 .../regor/compiler/tflite_graph_optimiser.hpp |   7 +
 2 files changed, 153 insertions(+)

diff --git a/ethosu/regor/compiler/tflite_graph_optimiser.cpp b/ethosu/regor/compiler/tflite_graph_optimiser.cpp
index fc805e15..c88282ed 100644
--- a/ethosu/regor/compiler/tflite_graph_optimiser.cpp
+++ b/ethosu/regor/compiler/tflite_graph_optimiser.cpp
@@ -2797,6 +2797,152 @@ Operation *TFLiteGraphOptimiser::ConvertZeroPoint(Graph *const graph, Operation
     return operation;
 }
 
+// Returns a new tensor named 'Name' whose own buffer holds a copy of one region of conn->tensor.
+//   conn        - connection whose tensor is read through View().SubView()
+//   sliceShape  - extent of the region, and the shape of the returned tensor (must be rank 4)
+//   sliceOffset - start of the region within the source view (must be rank 4)
+//   TYPE        - element type used to read the source values and to size and write the copy
+template<typename TYPE>
+static std::shared_ptr<Tensor>
+SliceConstTensor(const TensorConnection *conn, const Shape &sliceShape, const Shape &sliceOffset, const std::string &Name)
+{
+    assert((sliceShape.Size() == 4) && (sliceOffset.Size() == 4));
+
+    // Create a sub-view to read only a slice of the tensor
+    auto subBufferView = conn->tensor->View().SubView(sliceOffset, sliceShape);
+    BufferReader<TYPE> values = subBufferView.Values<TYPE>();
+
+    // Create a new buffer to hold the slice
+    int size = sliceShape.Elements();
+    auto newBuffer = std::make_shared<Buffer>(std::make_unique<TYPE[]>(size), size);
+    BufferView newBufferView(newBuffer, 0, 8 * sizeof(TYPE), sliceShape, {});
+    auto newValues = newBufferView.WritableValues<TYPE>();
+
+    // Copy the values over to the new buffer; the same slice-relative position indexes both views
+    for ( int n = 0; n < sliceShape.Batch(); n++ )
+    {
+        for ( int h = 0; h < sliceShape.Height(); h++ )
+        {
+            for ( int w = 0; w < sliceShape.Width(); w++ )
+            {
+                for ( int c = 0; c < sliceShape.Depth(); c++ )
+                {
+                    Shape pos({n, h, w, c}, sliceShape.Size());
+                    newValues[pos] = values[pos];
+                }
+            }
+        }
+    }
+
+    return std::make_shared<Tensor>(Name, conn->tensor->Type(), sliceShape, std::move(newBuffer));
+}
+
+// Converts a
convolution group with N groups into N * Conv2D ops each operating on a 1/N part of
+// the original channels. Finally, all of the individual results will be concatenated depth-wise into
+// the OFM tensor. Returns the Concat op, or 'operation' untouched when no whole groups can be formed.
+Operation *TFLiteGraphOptimiser::ConvertConvolutionGroup(Graph *const graph, Operation *const operation)
+{
+    UNUSED(graph);
+    if ( operation->Type() != OpType::Conv2D )
+    {
+        return operation;
+    }
+
+    const auto &ifmConn = operation->Input(TensorUsage::IFM0);
+    const auto &ifmShape = ifmConn->shape;
+    const auto &weightConn = operation->Input(TensorUsage::Weights);
+    const auto &weightShape = weightConn->shape;
+    const auto &biasConn = operation->Input(TensorUsage::Scales);
+    const auto &biasShape = biasConn->shape;
+    const auto &ofmConn = operation->Output(TensorUsage::OFM);
+    const auto &ofmShape = ofmConn->shape;
+
+    // Calculate the number of convolution groups based on the shape of the IFM read by the convolution,
+    // accounting for partial reads of the IFM. Anything not splitting into whole groups is left untouched.
+    auto ifmReadShape = ifmConn->slice.shape.IsEmpty() ? ifmShape : ifmConn->slice.shape;
+    const int numGroups = weightShape.Depth() > 0 ? ifmReadShape.Depth() / weightShape.Depth() : 0;
+    if ( numGroups <= 1 || ifmReadShape.Depth() % weightShape.Depth() != 0 || weightShape.Batch() % numGroups != 0 )
+    {
+        return operation;
+    }
+
+    // Create final Concat operation
+    auto concatOp = std::make_shared<Operation>(OpType::Concat);
+    concatOp->CopyOutput(TensorUsage::OFM, *ofmConn);
+    concatOp->Attribute<axis_attr_t>()->axis = -1;
+
+    // Create 'numGroups' number of convolutions, each reading a depth-wise slice of the IFM.
+    // NOTE(review): only slice.shape of the original IFM connection is read; its slice offset is not carried over.
+    int kernelsPerGroup = weightShape.Batch() / numGroups;
+    Shape zeroShape = ifmReadShape.WithZeros();
+    Shape ifmSlice = ifmReadShape.WithDepth(ifmReadShape.Depth() / numGroups);
+    Shape ofmSlice = ofmShape.WithDepth(ofmShape.Depth() / numGroups);
+    Shape weightSlice = weightShape.WithBatch(kernelsPerGroup);
+    Shape biasSlice = biasShape.WithDepth(kernelsPerGroup);
+
+    const auto &weightName = weightConn->tensor->Name();
+    const auto &ofmName = ofmConn->tensor->Name();
+    for ( int i = 0; i < numGroups; i++ )
+    {
+        // Create Convolution and connect the IFM sliced and offset
+        auto convGroupOp = std::make_shared<Operation>(OpType::Conv2D);
+        convGroupOp->ConnectInput(TensorUsage::IFM0, ifmConn->tensor)
+            .Set(ifmReadShape)
+            .Set(ifmConn->quantization)
+            .Set({zeroShape.WithDepth(i * ifmSlice.Depth()), ifmSlice});
+
+        // Create and connect intermediate OFM
+        auto ofmConvGroup = std::make_shared<Tensor>(ofmName + "_convgroup_output" + std::to_string(i), ofmConn->tensor->Type(), ofmSlice);
+        convGroupOp->ConnectOutput(TensorUsage::OFM, ofmConvGroup).Set(ofmSlice).Set(ofmConn->quantization).Set(ofmConn->rounding);
+
+        // Copy the kernel from the original operation
+        convGroupOp->SetKernel(std::make_unique<Kernel>(*operation->Kernel()));
+
+        // Extract a slice out of the weight tensor
+        assert(weightConn->tensor->Type() & DataType::Bits8);
+        Shape weightOffset = zeroShape.WithBatch(i * weightSlice.Batch());
+        auto weightSubTensor =
+            weightConn->tensor->Type() == DataType::UInt8 ?
+                SliceConstTensor<uint8_t>(weightConn, weightSlice, weightOffset, weightName + "weights" + std::to_string(i)) :
+                SliceConstTensor<int8_t>(weightConn, weightSlice, weightOffset, weightName + "weights" + std::to_string(i));
+
+        // Slice quantization info for weights and bias; vectors without one entry per kernel are kept whole
+        Quantization newWeightQuant = weightConn->quantization;
+        Quantization newBiasQuant = biasConn->quantization;
+        const auto sliceChannels = [&](auto &dst, const auto &src)
+        {
+            if ( int(src.size()) == weightShape.Batch() )
+            {
+                dst.assign(src.begin() + i * kernelsPerGroup, src.begin() + (i + 1) * kernelsPerGroup);
+            }
+        };
+        sliceChannels(newWeightQuant.scales, weightConn->quantization.scales);
+        sliceChannels(newWeightQuant.zeroPoints, weightConn->quantization.zeroPoints);
+        sliceChannels(newBiasQuant.scales, biasConn->quantization.scales);
+        sliceChannels(newBiasQuant.zeroPoints, biasConn->quantization.zeroPoints);
+
+        // Connect weights slice, described by the shape of the sliced tensor rather than the original weights
+        convGroupOp->ConnectInput(TensorUsage::Weights, weightSubTensor).Set(weightSlice).Set(newWeightQuant);
+
+        // Connect the bias and scales slice
+        convGroupOp->ConnectInput(TensorUsage::Scales, biasConn->tensor)
+            .Set(biasShape)
+            .Set(newBiasQuant)
+            .Set({zeroShape.WithDepth(i * biasSlice.Depth()), biasSlice});
+
+        // Connect intermediate OFM to Concat op
+        concatOp->ConnectInput(MakeTensorUsage(TensorUsage::IFM, i), ofmConvGroup)
+            .Set(ofmSlice)
+            .Set(convGroupOp->Output(TensorUsage::OFM)->quantization);
+
+        RecordOptimisation(operation, convGroupOp.get());
+    }
+
+    RecordOptimisation(operation, concatOp.get());
+    operation->Disconnect();
+    return concatOp.get();
+}
+
 TFLiteGraphOptimiser::TFLiteGraphOptimiser(IArchitectureConstraints *constraints, const GraphOptimiserOptions &options, OptimiserDatabase *db) :
         GraphOptimiser(constraints, options, db)
 {
diff --git a/ethosu/regor/compiler/tflite_graph_optimiser.hpp b/ethosu/regor/compiler/tflite_graph_optimiser.hpp
index e76ff4dd..bc78d113 100644
--- 
a/ethosu/regor/compiler/tflite_graph_optimiser.hpp +++ b/ethosu/regor/compiler/tflite_graph_optimiser.hpp @@ -95,6 +95,7 @@ private: Operation *ConvertTanhSigmoidToLUT16(Operation *const op); // Rewrite functions + Operation *ConvertConvolutionGroup(Graph *const graph, Operation *const operation); Operation *ConvertExpToLUT(Graph *const graph, Operation *const operation); Operation *RewritePack(Graph *const graph, Operation *const operation); Operation *RewriteUnpack(Graph *const graph, Operation *const operation); @@ -197,6 +198,12 @@ public: #endif } }, + { + {}, + { + &TFLiteGraphOptimiser::ConvertConvolutionGroup, + } + }, { {}, { -- GitLab