diff --git a/ethosu/regor/compiler/cascade_builder.cpp b/ethosu/regor/compiler/cascade_builder.cpp
index 37fff84034e7c3d0dc2bdbfeb5792d20de6a1c33..25b5f2a400c8eb3729198337b885ee7139206df0 100644
--- a/ethosu/regor/compiler/cascade_builder.cpp
+++ b/ethosu/regor/compiler/cascade_builder.cpp
@@ -1,5 +1,5 @@
 //
-// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+// SPDX-FileCopyrightText: Copyright 2021-2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
 //
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -90,7 +90,7 @@ public:
             auto consumerCost = refSchedule->Cost(consumer);
             bufferShape = RollingBufferShape(producerCost->stripe, consumerCost->stripeInput[0]);
-            bufferSize = DataTypeStorageSizeBytes(ofm->tensor->dataType, bufferShape.Elements());
+            bufferSize = DataTypeStorageSizeBytes(ofm->Type(), bufferShape.Elements());
         }
     }

     _cache.emplace(key, CascadeBuffer(bufferShape, bufferSize));
diff --git a/ethosu/regor/compiler/high_level_command_stream_generator.cpp b/ethosu/regor/compiler/high_level_command_stream_generator.cpp
index bd3ded4c76ccf5e09b1b9ba5bb268394ebcacc3d..b661fe34f4dbf3b71a563f99dcbea711d788ca0e 100644
--- a/ethosu/regor/compiler/high_level_command_stream_generator.cpp
+++ b/ethosu/regor/compiler/high_level_command_stream_generator.cpp
@@ -241,7 +241,7 @@ static void MakeFeatureMap(TensorUsage usage, const SchedulerConnection *schedCo
     auto schedTens = schedConn->tensor.get();
     fm.shape = schedConn->shape;
     fm.slice = schedConn->slice;
-    fm.dataType = schedTens->dataType;
+    fm.dataType = schedConn->Type();
     fm.memArea = schedTens->memArea;
     fm.format = schedTens->format;
     fm.usage = usage;
@@ -325,7 +325,7 @@ static HLCSubOperation MakeSubOperation(const std::unique_ptr<SchedulerOperation
         param.memArea = lutTensor->memArea;
         param.address = lutTensor->AllocatedAddress();
         param.sizeBytes = lutTensor->AllocationSizeBytes();
-        param.ifmType = schedOp->IFM(0)->tensor->dataType;
+        param.ifmType = schedOp->IFM(0)->Type();
     }
     return hlcSubOp;
 }
@@ -395,7 +395,7 @@ static std::shared_ptr<HLCOperation> MakeOperation(SchedulerOperation *schedOp,
         param.memArea = lutTensor->memArea;
         param.address = lutTensor->AllocatedAddress();
         param.sizeBytes = lutTensor->AllocationSizeBytes();
-        param.ifmType = schedOp->IFM(0)->tensor->dataType;
+        param.ifmType = schedOp->IFM(0)->Type();
     }

     for ( auto &subOp : schedOp->SubOps() )
@@ -448,7 +448,7 @@ static std::shared_ptr<HLCOperation> MakeOperation(SchedulerOperation *schedOp,
         auto *ifmConn = schedOp->Input(TensorUsage::IFM);
         auto *params = schedOp->Input(TensorUsage::Params);
         assert(params);
-        assert(params->tensor->dataType == DataType::Int32);
+        assert(params->Type() == DataType::Int32);
         auto view = params->tensor->srcTensor->View();
         Shape multiples(view.Buffer()->Data(), view.ViewShape().Elements());
         multiples = Shape::PadAxes(multiples, ifmConn->shape.Size(), 1);
diff --git a/ethosu/regor/compiler/network_performance.cpp b/ethosu/regor/compiler/network_performance.cpp
index b099534639d96620aa2c573293b82827b1f55e90..1567a43a7009eebda14f8e01b834b0c04b224ef9 100644
--- a/ethosu/regor/compiler/network_performance.cpp
+++ b/ethosu/regor/compiler/network_performance.cpp
@@ -319,9 +319,9 @@ void NetworkPerformance::AddToDatabase(const PerformanceResult &perf, SchedulerO
         fmt::format("{}", schedOp->TryIFM(1) ? EnumToString(schedOp->IFM(1)->tensor->format) : ""),
         fmt::format("{}", EnumToString(schedOp->OFM()->tensor->format)),
         // Data types
-        fmt::format("{}", EnumToString(schedOp->IFM(0)->tensor->dataType)),
-        fmt::format("{}", schedOp->TryIFM(1) ? EnumToString(schedOp->IFM(1)->tensor->dataType) : ""),
-        fmt::format("{}", EnumToString(schedOp->OFM()->tensor->dataType)),
+        fmt::format("{}", EnumToString(schedOp->IFM(0)->Type())),
+        fmt::format("{}", schedOp->TryIFM(1) ? EnumToString(schedOp->IFM(1)->Type()) : ""),
+        fmt::format("{}", EnumToString(schedOp->OFM()->Type())),
         // IFM Buffering
         std::to_string(schedOp->IFM(0)->preBuffer),
         schedOp->TryIFM(1) ? std::to_string(schedOp->IFM(1)->preBuffer) : "",
@@ -503,21 +503,21 @@ PerformanceResult NetworkPerformance::EstimateFullOpPerformance(
     auto ofm = schedOp->OFM();
     result.memory[ofm->tensor->memArea.memory].access[AccessType::FeatureMap].bytesWritten += byteAccess.ofmWrite;
     result.memory[ofm->tensor->memArea.memory]
-        .writeTransferOverhead += byteAccess.ofmWrite - DataTypeStorageSizeBytes(ofm->tensor->dataType, access.ofmWrite);
+        .writeTransferOverhead += byteAccess.ofmWrite - DataTypeStorageSizeBytes(ofm->Type(), access.ofmWrite);

     // IFM1 read
     auto ifm = schedOp->IFM(0);
     result.memory[ifm->tensor->memArea.memory].access[AccessType::FeatureMap].bytesRead += byteAccess.ifmRead[0];
     result.memory[ifm->tensor->memArea.memory]
-        .readTransferOverhead += byteAccess.ifmRead[0] - DataTypeStorageSizeBytes(ifm->tensor->dataType, access.ifmRead[0]);
+        .readTransferOverhead += byteAccess.ifmRead[0] - DataTypeStorageSizeBytes(ifm->Type(), access.ifmRead[0]);

     // IFM2 read
     auto ifm2 = schedOp->TryIFM(1);
     if ( ifm2 )
     {
         result.memory[ifm2->tensor->memArea.memory].access[AccessType::FeatureMap].bytesRead += byteAccess.ifmRead[1];
-        result.memory[ifm2->tensor->memArea.memory].readTransferOverhead +=
-            byteAccess.ifmRead[1] - DataTypeStorageSizeBytes(ifm2->tensor->dataType, access.ifmRead[1]);
+        result.memory[ifm2->tensor->memArea.memory]
+            .readTransferOverhead += byteAccess.ifmRead[1] - DataTypeStorageSizeBytes(ifm2->Type(), access.ifmRead[1]);
     }

     // Reads/writes to temporary or intermediate memories
@@ -526,11 +526,11 @@ PerformanceResult NetworkPerformance::EstimateFullOpPerformance(
     {
         result.memory[scratch->tensor->memArea.memory].access[AccessType::FeatureMap].bytesRead += byteAccess.tmpRead;
         result.memory[scratch->tensor->memArea.memory]
-            .readTransferOverhead += byteAccess.tmpRead - DataTypeStorageSizeBytes(scratch->tensor->dataType, access.tmpRead);
+            .readTransferOverhead += byteAccess.tmpRead - DataTypeStorageSizeBytes(scratch->Type(), access.tmpRead);

         result.memory[scratch->tensor->memArea.memory].access[AccessType::FeatureMap].bytesWritten += byteAccess.tmpWrite;
-        result.memory[scratch->tensor->memArea.memory].readTransferOverhead +=
-            byteAccess.tmpWrite - DataTypeStorageSizeBytes(scratch->tensor->dataType, access.tmpWrite);
+        result.memory[scratch->tensor->memArea.memory]
+            .readTransferOverhead += byteAccess.tmpWrite - DataTypeStorageSizeBytes(scratch->Type(), access.tmpWrite);
     }

     // Weight/scale reads
diff --git a/ethosu/regor/compiler/scheduler.cpp b/ethosu/regor/compiler/scheduler.cpp
index 114254d40c963f06c77fc6c6ca21f638eeacee67..df9c8d64909d3e50a7a7845b2e4f223c84c2bc9a 100644
--- a/ethosu/regor/compiler/scheduler.cpp
+++ b/ethosu/regor/compiler/scheduler.cpp
@@ -292,7 +292,7 @@ int Scheduler::UpdateSchedulerTensor(TensorUsage usage, SchedulerConnection *con
             continue;
         }
         // Int32 ReduceSum requires linear format
-        else if ( consumer->Type() == OpType::ReduceSum && tensor->dataType == DataType::Int32 )
+        else if ( consumer->Type() == OpType::ReduceSum && conn->Type() == DataType::Int32 )
         {
             tensor->needsLinearFormat = true;
             continue;
@@ -424,8 +424,8 @@ std::unique_ptr<ArchitectureOpConfig> GetOpConfig(Architecture *arch, SchedulerO
     query.ofmShape = Shape::PadAxes(ofmShape, 3, 1);
     query.ifmShape[0] = ifmShape;
     query.ifmShape[1] = ifm2Shape;
-    query.ifmBits = DataTypeSizeBits(ifm->tensor->dataType);
-    query.ofmBits = DataTypeSizeBits(ofm->tensor->dataType);
+    query.ifmBits = DataTypeSizeBits(ifm->Type());
+    query.ofmBits = DataTypeSizeBits(ofm->Type());
     query.kernel = op->Kernel();
     query.lutBytes = op->TryInput(TensorUsage::LUT) ? 2048 : 0;
     query.scaled = op->HasScaling();
@@ -584,9 +584,9 @@ WeightScaleEncoding Scheduler::EncodeBestWeightFormat(
    std::vector<WeightScaleEncoding> encodingResults;
     auto weights = op->Input(TensorUsage::Weights);
     auto scales = op->Input(TensorUsage::Scales);
-    WeightsRef weightsRef = {&weights->tensor->bufferView, weights->tensor->srcTensor->AxisOrder(), weights->tensor->dataType};
+    WeightsRef weightsRef = {&weights->tensor->bufferView, weights->tensor->srcTensor->AxisOrder(), weights->Type()};
     auto ifm = op->IFM(op->PrimaryIfmIndex());
-    auto ifmType = ifm->tensor->dataType;
+    auto ifmType = ifm->Type();
     std::vector<int> depthOffsets{0, ofmShape.Unpermute(uint32_t(op->OFM()->transpose)).Depth()};
     std::vector<WeightFormat> formatList = {WF(WeightFormat::Default, WeightFormat::Sparse2_4), WF(WeightFormat::Default),
@@ -712,7 +712,7 @@ std::unique_ptr<SchedulerOpInfo> Scheduler::CreateSchedulerOpInfo(
     {
         auto scales = op->Input(TensorUsage::Scales);
         auto temp = _arch->WeightEncoder()->MakeExplicit(ifm->quantization, weights->quantization,
-            op->OFM()->quantization, scales->tensor->dataType, ifm->tensor->dataType, op->Type());
+            op->OFM()->quantization, scales->Type(), ifm->Type(), op->Type());
         op->OFM()->quantization = std::move(temp);
         assert(op->OFM()->quantization.type == QuantizationType::EXPLICIT);
     }
@@ -735,7 +735,7 @@ std::unique_ptr<SchedulerOpInfo> Scheduler::CreateSchedulerOpInfo(
     // The operation might have been decomposed in depth dimension and have an offset
     const int depthBase = op->OFM()->slice.offset ? op->OFM()->slice.offset.Depth() : 0;
     auto encodingParams = _arch->WeightEncoder()->GetEncodingConfig(
-        blockConfig.get(), weightsRef, op->Kernel(), ifm->tensor->dataType, depthBase, depthOffsets, weightFormat);
+        blockConfig.get(), weightsRef, op->Kernel(), ifm->Type(), depthBase, depthOffsets, weightFormat);

     const SchedulerTensor *scaleTensor = scales ? scales->tensor.get() : nullptr;
     weightScales = EncodeQuantizationScaleTensor(std::move(encodingParams), op->OFM()->quantization, scaleTensor);
@@ -1673,7 +1673,7 @@ PerformanceQuery Scheduler::InitPerfQuery(
     SchedulerConnection *ifm0 = op->IFM(0);
     query.ifmShape[0] = ifm0->SliceShape();
     query.ifmMemory[0] = ifm0->tensor->memArea.memory;
-    query.ifmType[0] = ifm0->tensor->dataType;
+    query.ifmType[0] = ifm0->Type();
     query.ifmFormat[0] = ifm0->tensor->format;

     SchedulerConnection *ifm1 = op->TryIFM(1);
@@ -1681,7 +1681,7 @@
     {
         query.ifmShape[1] = ifm1->SliceShape();
         query.ifmMemory[1] = ifm1->tensor->memArea.memory;
-        query.ifmType[1] = ifm1->tensor->dataType;
+        query.ifmType[1] = ifm1->Type();
         query.ifmFormat[1] = ifm1->tensor->format;
     }
@@ -1689,7 +1689,7 @@
     ofmDepth = (ofmDepth >= 0) ? ofmDepth : ofm->SliceShape().Depth();
     query.ofmShape = ofm->SliceShape().WithDepth(ofmDepth);
     query.ofmMemory = ofm->tensor->memArea.memory;
-    query.ofmType = ofm->tensor->dataType;
+    query.ofmType = ofm->Type();
     query.ofmFormat = ofm->tensor->format;

     SchedulerConnection *scratch = op->TryInput(TensorUsage::Scratch);
@@ -1740,7 +1740,7 @@ std::vector<FusionQuery> Scheduler::InitFusionQuery(SchedulerOperation *op)
     {
         fusedOp.ifm2Shape = ifm2->shape;
         fusedOp.ifm2Memory = ifm2->tensor->memArea.memory;
-        fusedOp.ifm2Type = ifm2->tensor->dataType;
+        fusedOp.ifm2Type = ifm2->Type();
         fusedOp.ifm2Format = ifm2->tensor->format;
     }
 }
diff --git a/ethosu/regor/compiler/scheduler_decompose.cpp b/ethosu/regor/compiler/scheduler_decompose.cpp
index 51cf25185a06990fdca23f2a68d970a16622df76..0b5ba748c410b720f9bb1b2f8dab4ec04b83b83b 100644
--- a/ethosu/regor/compiler/scheduler_decompose.cpp
+++ b/ethosu/regor/compiler/scheduler_decompose.cpp
@@ -51,7 +51,6 @@ bool ShouldDecompose(Architecture *arch, const SchedulerOperation *schedOp)

 static std::unique_ptr<SchedulerOperation> MakeMemCopy(const std::shared_ptr<SchedulerTensor> &source,
     const std::shared_ptr<SchedulerTensor> &dest, const TensorSlice *ofmSlice = nullptr)
 {
-    assert(source->dataType == dest->dataType);
     assert(ofmSlice == nullptr || ofmSlice->shape + ofmSlice->offset <= dest->storageShape);
     auto op = std::make_unique<SchedulerOperation>(OpType::MemoryCopy);
@@ -61,10 +60,10 @@ static std::unique_ptr<SchedulerOperation> MakeMemCopy(const std::shared_ptr<Sch
     auto ofmConn = op->AddOutput(TensorUsage::OFM);
     ofmConn->tensor = dest;
     if ( ofmSlice ) ofmConn->slice = *ofmSlice;
-    if ( ofmConn->tensor->dataType == DataType::Int64 )
+    if ( ofmConn->Type() == DataType::Int64 )
     {
         // Copy int64 data as int32 data with 2 x C by cloning destination tensor
         ofmConn->tensor = std::make_shared<SchedulerTensor>(*dest);
-        ofmConn->tensor->dataType = DataType::Int32;
+        ofmConn->SetType(DataType::Int32);
         ofmConn->tensor->storageShape = dest->storageShape.WithDepth(2 * dest->storageShape.Depth());
         ofmConn->tensor->producers.clear();
         if ( ofmSlice )
@@ -77,10 +76,11 @@ static std::unique_ptr<SchedulerOperation> MakeMemCopy(const std::shared_ptr<Sch
     ofmConn->tensor->producers.push_back(op.get());
     auto ifmConn = op->ConnectInput(TensorUsage::IFM, source);
-    if ( ifmConn->tensor->dataType == DataType::Int64 )
+    assert(ifmConn->Type() == ofmConn->Type());
+    if ( ifmConn->Type() == DataType::Int64 )
     {
         // Copy int64 data as int32 data with 2 x C by cloning source tensor
         ifmConn->tensor = std::make_shared<SchedulerTensor>(*source);
-        ifmConn->tensor->dataType = DataType::Int32;
+        ifmConn->SetType(DataType::Int32);
         ifmConn->tensor->storageShape = source->storageShape.WithDepth(2 * source->storageShape.Depth());
         source->RemoveReader(op.get());
     }
@@ -92,9 +92,11 @@ static std::unique_ptr<SchedulerOperation> MakeTransposeOp(
     const std::shared_ptr<SchedulerTensor> &source, const std::shared_ptr<SchedulerTensor> &dest, const Shape &perm)
 {
-    assert(source->dataType == dest->dataType);
     assert(source->storageShape.Size() == perm.Size());
     auto op = std::make_unique<SchedulerOperation>(OpType::Transpose);
+    auto ifmConn = op->AddInput(TensorUsage::IFM);
+    auto ofmConn = op->AddOutput(TensorUsage::OFM);
+    assert(ifmConn->Type() == ofmConn->Type());

     auto kernel = Kernel({1, 1}, {1, 1}, {1, 1});
     op->SetKernel(&kernel);

@@ -102,25 +104,23 @@ static std::unique_ptr<SchedulerOperation> MakeTransposeOp(
     const auto attr = op->Attribute<transpose_attr_t>();
     attr->perm = perm;
-    auto ifmConn = op->AddInput(TensorUsage::IFM);
     ifmConn->tensor = source;
-    if ( ifmConn->tensor->dataType == DataType::Int64 )
+    if ( ifmConn->Type() == DataType::Int64 )
     {
         // Read int64 data as int32 data with dimensions [..., C, 2] by cloning source tensor
         ifmConn->tensor = std::make_shared<SchedulerTensor>(*source);
-        ifmConn->tensor->dataType = DataType::Int32;
+        ifmConn->SetType(DataType::Int32);
         ifmConn->tensor->storageShape = source->storageShape.Insert(source->storageShape.Size(), 2);
         attr->perm = perm.Insert(perm.Size(), perm.Size());  // Update permutation with added dimension
     }
     ifmConn->shape = ifmConn->tensor->storageShape;
     ifmConn->tensor->consumers.push_back(op.get());
-    auto ofmConn = op->AddOutput(TensorUsage::OFM);
     ofmConn->transpose = TransposeTypeFromShape(attr->perm);
     ofmConn->tensor = dest;
-    if ( ofmConn->tensor->dataType == DataType::Int64 )
+    if ( ofmConn->Type() == DataType::Int64 )
     {
         // Write int64 data as int32, with dimensions from ifm
         ofmConn->tensor = std::make_shared<SchedulerTensor>(*dest);
-        ofmConn->tensor->dataType = DataType::Int32;
+        ofmConn->SetType(DataType::Int32);
         ofmConn->tensor->storageShape = ifmConn->shape.Permute(uint32_t(ofmConn->transpose));
         ofmConn->tensor->producers.clear();
     }
@@ -191,8 +191,8 @@ static std::unique_ptr<ArchitectureOpConfig> GetOpConfig(Architecture *arch, con
     {
         qConfig.ifmShape[1] = ifm1->SliceShape();
     }
-    qConfig.ifmBits = DataTypeSizeBits(ifm->tensor->dataType);
-    qConfig.ofmBits = DataTypeSizeBits(ofm->tensor->dataType);
+    qConfig.ifmBits = DataTypeSizeBits(ifm->Type());
+    qConfig.ofmBits = DataTypeSizeBits(ofm->Type());
     qConfig.kernel = schedOp->Kernel();
     qConfig.lutBytes = schedOp->TryInput(TensorUsage::LUT) ? 2048 : 0;
     qConfig.scaled = schedOp->HasScaling();
@@ -878,7 +878,7 @@ std::vector<std::unique_ptr<SchedulerOperation>> DecomposeConv3D(Architecture *a
     auto acc = std::make_shared<SchedulerTensor>();
     acc->uid = GenerateUniqueId();
     acc->memArea = ofmConn->tensor->memArea;
-    acc->dataType = ifmConn->tensor->dataType == DataType::Int16 ? DataType::Int64 : DataType::Int32;
+    acc->dataType = ifmConn->Type() == DataType::Int16 ? DataType::Int64 : DataType::Int32;
     acc->storageShape = Shape(ofmShape, 4).WithBatch(1);
     // Create ifm zero point SchedulerTensor, only needed for broadcast
     // Setup is done below if needed
@@ -929,7 +929,7 @@ std::vector<std::unique_ptr<SchedulerOperation>> DecomposeConv3D(Architecture *a
     {
         // Setup SchedulerTensor for 0 input
         ifm0->uid = GenerateUniqueId();
-        ifm0->dataType = subOpIfm->tensor->dataType;
+        ifm0->dataType = subOpIfm->Type();
         ifm0->memArea = arch->ReadonlyMemory();
         ifm0->format = TensorFormat::NHWC;
         const auto bufSize = ifm0shape.Elements();
diff --git a/ethosu/regor/compiler/scheduler_decompose.hpp b/ethosu/regor/compiler/scheduler_decompose.hpp
index 256a57c1c89be7d6d6c0852f7dc6d740a1cfb08b..a872ee82e9555908802dd756257c946d2c6558d6 100644
--- a/ethosu/regor/compiler/scheduler_decompose.hpp
+++ b/ethosu/regor/compiler/scheduler_decompose.hpp
@@ -53,7 +53,7 @@ inline ArchFM &Set(ArchFM &fm, const SchedulerConnection *conn)
 {
     if ( conn )
     {
-        fm.type = conn->tensor->dataType;
+        fm.type = conn->Type();
         fm.shape = conn->SliceShape();
         fm.format = conn->tensor->format;
         fm.quantization = conn->quantization;
diff --git a/ethosu/regor/compiler/scheduler_operation.hpp b/ethosu/regor/compiler/scheduler_operation.hpp
index 12976f0158f38041f3812e0662ea664f125dccbf..d88d9a4fd30a7fa31ef098a4bbcb49dce66c87a7 100644
--- a/ethosu/regor/compiler/scheduler_operation.hpp
+++ b/ethosu/regor/compiler/scheduler_operation.hpp
@@ -134,6 +134,10 @@ enum class Buffering
 ///
 struct SchedulerConnection
 {
+private:
+    DataType dataType = DataType::None;
+
+public:
     std::shared_ptr<SchedulerTensor> tensor;
     Shape shape;
     TensorSlice slice;
@@ -149,6 +153,8 @@ struct SchedulerConnection
     int PartialAllocationSizeBytes() const { return TensorAllocationBytes(shape, tensor->format, tensor->dataType); }
     const Shape &SliceShape() const { return slice.shape.IsEmpty() ? shape : slice.shape; }
+    void SetType(DataType dt) { dataType = dt; }
+    DataType Type() const { return dataType == DataType::None ? tensor->dataType : dataType; }
 };

 enum class AccumulatorSource
diff --git a/ethosu/regor/compiler/scheduler_packing.cpp b/ethosu/regor/compiler/scheduler_packing.cpp
index 4b8419ec9955b0c5f4f52b6dcdc69a64428bb52d..956daf6c013b5a92c692e74e818ea06e54348e70 100644
--- a/ethosu/regor/compiler/scheduler_packing.cpp
+++ b/ethosu/regor/compiler/scheduler_packing.cpp
@@ -171,7 +171,7 @@ ArchitectureOpGroupQuery SchedulerPacking::CreateOpGroupQuery(const SchedulerOpe
     auto ifm1 = schedOp->TryIFM(1);
     auto ofm = schedOp->OFM();
     query.ifm[0].key = ifm0->tensor->uid;
-    query.ifm[0].type = ifm0->tensor->dataType;
+    query.ifm[0].type = ifm0->Type();
     query.ifm[0].shape = ifm0->SliceShape();
     query.ifm[0].transpose = ifm0->transpose;
     query.ifm[0].reverse = ifm0->reverse;
@@ -179,14 +179,14 @@ ArchitectureOpGroupQuery SchedulerPacking::CreateOpGroupQuery(const SchedulerOpe
     if ( ifm1 )
     {
         query.ifm[1].key = ifm1->tensor->uid;
-        query.ifm[1].type = ifm1->tensor->dataType;
+        query.ifm[1].type = ifm1->Type();
         query.ifm[1].shape = ifm1->SliceShape();
         query.ifm[1].transpose = ifm1->transpose;
         query.ifm[1].reverse = ifm1->reverse;
         query.ifm[1].isConst = ifm1->tensor->IsConstant();
     }
     query.ofm.key = ofm->tensor->uid;
-    query.ofm.type = ofm->tensor->dataType;
+    query.ofm.type = ofm->Type();
     query.ofm.shape = ofm->SliceShape();
     query.ofm.transpose = ofm->transpose;
     query.ofm.reverse = ofm->reverse;
@@ -286,6 +286,7 @@ void SchedulerPacking::SchedulerPacking::PackOperations()
         {
             auto *ofmConn = primaryOp->OFM();
             ofmConn->tensor = nextOp->OFM()->tensor;
+            ofmConn->SetType(nextOp->OFM()->Type());
             ofmConn->quantization.quantMin = nextOp->Output(TensorUsage::OFM)->quantization.quantMin;
             ofmConn->quantization.quantMax = nextOp->Output(TensorUsage::OFM)->quantization.quantMax;
         }
@@ -293,6 +294,7 @@ void SchedulerPacking::SchedulerPacking::PackOperations()
         {
             auto *ofmConn = primaryOp->OFM();
             ofmConn->tensor = nextOp->OFM()->tensor;
+            ofmConn->SetType(nextOp->OFM()->Type());
             ofmConn->shape = nextOp->OFM()->shape;
             ofmConn->transpose = nextOp->OFM()->transpose;
         }
@@ -300,6 +302,7 @@ void SchedulerPacking::SchedulerPacking::PackOperations()
         {
             auto *ofmConn = primaryOp->OFM();
             ofmConn->tensor = nextOp->OFM()->tensor;
+            ofmConn->SetType(nextOp->OFM()->Type());
             ofmConn->shape = nextOp->OFM()->shape;
             ofmConn->reverse = nextOp->OFM()->reverse;
         }
@@ -474,6 +477,7 @@ void SchedulerPacking::InitSchedulerConnection(
     schedConn->reverse = conn.reverse;
     schedConn->resamplingMode = ArchResampling::None;
     schedConn->rounding = conn.rounding;
+    schedConn->SetType(tensor->dataType);
     if ( schedConn->slice.stride )
     {
         schedConn->stepXY = schedConn->slice.stride.WH();
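
The core of this patch is the connection-level type introduced in scheduler_operation.hpp: SchedulerConnection::Type() falls back to the underlying tensor's dataType until SetType() pins a connection-specific view, which is how an Int64 tensor can be read or written as Int32 with an adjusted storage shape without mutating the shared SchedulerTensor. Below is a minimal standalone sketch of that fallback behaviour; the stand-in structs are stripped down to the fields this mechanism touches, and the real definitions carry many more members.

    #include <cassert>
    #include <memory>

    enum class DataType { None, Int32, Int64 };

    struct SchedulerTensor
    {
        DataType dataType = DataType::Int64;
    };

    struct SchedulerConnection
    {
        std::shared_ptr<SchedulerTensor> tensor;

        void SetType(DataType dt) { dataType = dt; }
        // With no override set, defer to the tensor's type. Note this
        // dereferences tensor, so the connection must already be wired up.
        DataType Type() const { return dataType == DataType::None ? tensor->dataType : dataType; }

    private:
        DataType dataType = DataType::None;
    };

    int main()
    {
        SchedulerConnection conn;
        conn.tensor = std::make_shared<SchedulerTensor>();
        assert(conn.Type() == DataType::Int64);            // falls through to the tensor's type
        conn.SetType(DataType::Int32);                     // e.g. int64 copied as int32 with 2 x C
        assert(conn.Type() == DataType::Int32);            // override wins from now on
        assert(conn.tensor->dataType == DataType::Int64);  // shared tensor is untouched
        return 0;
    }

This is also why SchedulerPacking::InitSchedulerConnection seeds each connection with SetType(tensor->dataType) and why PackOperations re-seeds the fused OFM connection: once operations are packed and tensors are substituted, the connection's view of the data type has to survive independently of which tensor it ends up pointing at.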