diff --git a/ethosu/regor/architecture/architecture.hpp b/ethosu/regor/architecture/architecture.hpp
index c188d83384286aeb9ffbca455d8c219583459328..02a6b7e299d03526cbcff03fbdc01664233aec87 100644
--- a/ethosu/regor/architecture/architecture.hpp
+++ b/ethosu/regor/architecture/architecture.hpp
@@ -186,9 +186,6 @@ public:
     virtual ~ArchitectureOpGroup() = default;
     virtual int Add(const ArchitectureOpGroupQuery &op, const std::vector<int> &dependsOn = {}) = 0;
     virtual bool NeedsAllocation(UniqueId tensorUID) = 0;
-
-protected:
-    virtual bool CanRunOnNPU(const ArchitectureOpGroupQuery &op) = 0;
 };
 
 enum class ArchAccumulatorSource : uint8_t
diff --git a/ethosu/regor/architecture/architecture_constraints.hpp b/ethosu/regor/architecture/architecture_constraints.hpp
index e08b1c24dae038b48336832d57d990406bdb2004..adad6f59962dd5bff74575320bf0ac30b87b9868 100644
--- a/ethosu/regor/architecture/architecture_constraints.hpp
+++ b/ethosu/regor/architecture/architecture_constraints.hpp
@@ -51,16 +51,29 @@ struct ArchOperatorQuery
     ArchFM ofm;
     ReverseType reverseMask = ReverseType::None;
     TransposeType transposeMask = TransposeType::None;
+    Kernel *kernel = nullptr;
     ~ArchOperatorQuery(){};
 };
 
 enum class ArchRequirement
 {
     None = 0,
-    ScratchTensor = 1,
-    OpSubstitution = 2,
-    OutputFormat = 4,
-    InputFormat = 8,
+    ScratchTensor = 1 << 0,
+    OutputFormat = 1 << 1,
+    InputFormat = 1 << 2,
+    OpSubstitution = 1 << 3,
+    Decompose = 1 << 4,
+};
+
+enum class ArchProperty  // Bit flags naming operator properties that need decomposing (see ArchRequirements::decomposeProps)
+{
+    None = 0,
+    TensorAxis = 1 << 0,       // an H/W/C axis of a tensor is larger than the hardware can address
+    TensorDims = 1 << 1,       // a tensor has non-unit dimensions ahead of H/W/C
+    KernelStride = 1 << 2,     // kernel stride is outside the natively supported range
+    KernelDilation = 1 << 3,   // kernel dilation is outside the natively supported range
+    DepthMultiplier = 1 << 4,  // kernel depth multiplier is not natively supported
+    TransposeMask = 1 << 5,    // NOTE(review): presumably a transpose permutation without native support - confirm
 };
 
 struct ArchRequirements
@@ -76,6 +89,7 @@ struct ArchRequirements
     TensorFormat ifm1Format = TensorFormat::Unknown;
     TensorFormat ofmFormat = TensorFormat::Unknown;
     OpType substitution = OpType::None;
+    Flags<ArchProperty> decomposeProps;
 };
 
 enum class TransposeSupport
@@ -98,10 +112,8 @@ enum class QueryResult
     Native = 2,
     Constrained = 4,
     HasRequirements = 8,
-    Decompose = 16,
     NativeHasReq = Native | HasRequirements,
     NativeConstrained = Native | Constrained,
-    NativeDecompose = Native | Decompose,
     NativeConstrainedHasReq = Native | Constrained | HasRequirements,
 };
 
@@ -112,15 +124,18 @@ class IArchitectureConstraints
 {
 public:
     virtual ~IArchitectureConstraints() = default;
-    virtual bool SupportsFusedReverse(OpType opType, ReverseType reverseTypeMask) = 0;
     virtual bool SupportsFusedRescale(OpType opType, TensorUsage tensorUsage, DataType rescaleFromType,
         DataType rescaleToType, DataType opFromType, DataType opToType, const Quantization &quantization) = 0;
-    virtual TransposeSupport SupportsFusedTranspose(OpType opType, TransposeType transposeType) = 0;
     virtual bool SupportsAccumulatorSaveRestore() = 0;
     virtual bool SupportsNegativeStrides() = 0;
     virtual bool SupportsElementwiseLeakyRelu(bool quantized, DataType type) = 0;
     virtual bool SupportsRescale(DataType fromType, DataType toType) = 0;
     virtual Flags<QueryResult> OperatorQuery(OpType opType, const ArchOperatorQuery *query = nullptr, ArchRequirements *req = nullptr) = 0;
+
+private:
+    virtual bool SupportedDtypes(OpType opType, DataType ifmType, DataType ifm2Type, DataType ofmType) = 0;
+    virtual bool SupportsFusedReverse(OpType opType, ReverseType reverseTypeMask) = 0;
+    virtual TransposeSupport SupportsFusedTranspose(OpType opType, TransposeType transposeType) = 0;
 };
 
 }  // namespace regor
diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55.cpp b/ethosu/regor/architecture/ethosu55/ethos_u55.cpp
index 39c8112043116ac64d1cb038050bc13ae7dee0eb..39c486daf7b7c842b9019a203618e8254ad3b3b0 100644
--- a/ethosu/regor/architecture/ethosu55/ethos_u55.cpp
+++ b/ethosu/regor/architecture/ethosu55/ethos_u55.cpp
@@ -98,9 +98,6 @@ static const int s_SHRAMElementBits[] = {
 
 static_assert(std::size(s_SHRAMElementBits) == int(SHRAM_Last) + 1, "Bad element mapping");
 
-// max size for tensor axes
-const static Shape MAX_SHAPE(nullptr, 8, 65536);
-
 ArchEthosU55::ArchEthosU55() : _subkernelMax(8, 8, 65536), _ofmBlockMax(32, 64, 128)
 {
     _weightEncoder = std::make_unique<EthosU55WeightEncoder>(this);
@@ -742,12 +739,6 @@ int EthosU55OpGroup::Add(const ArchitectureOpGroupQuery &op, const std::vector<i
         }
     }
 
-    if ( !CanRunOnNPU(op) )
-    {
-        // Can only fuse NPU ops
-        return 0;
-    }
-
     if ( _opsCount > 0 && !IsActivation(op.type) )
     {
         // Can only fuse with activation
@@ -800,178 +791,4 @@ bool EthosU55OpGroup::NeedsAllocation(UniqueId tensorUID)
     return _fusedTensors.count(tensorUID) == 0;
 }
 
-// Table of allowed ifm/ofm data type combinations for each HWOp
-static const std::unordered_map<EthosU55NpuOp, std::unordered_map<DataType, std::vector<DataType>>> s_opDataTypeSupport = {
-    {EthosU55NpuOp::Convolution,  // HWOp
-        {
-            // IFM data type  | OFM data type(s)
-            {DataType::UInt8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}},
-            {DataType::Int8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}},
-            {DataType::Int16, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}},
-        }},
-    {EthosU55NpuOp::Depthwise,
-        {
-            {DataType::UInt8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}},
-            {DataType::Int8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}},
-            {DataType::Int16, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}},
-        }},
-    {EthosU55NpuOp::VectorProduct,
-        {
-            {DataType::UInt8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}},
-            {DataType::Int8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}},
-            {DataType::Int16, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}},
-        }},
-    {EthosU55NpuOp::Pooling,
-        {
-            {DataType::UInt8, {DataType::UInt8}},
-            {DataType::Int8, {DataType::Int8}},
-            {DataType::Int16, {DataType::Int16}},
-        }},
-    {EthosU55NpuOp::ReduceSum,
-        {
-            {DataType::UInt8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}},
-            {DataType::Int8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}},
-            {DataType::Int16, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}},
-            {DataType::Int32, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}},
-        }},
-};
-
-bool EthosU55OpGroup::CanRunOnNPU(const ArchitectureOpGroupQuery &op)
-{
-    EthosU55NpuOp npuOp = ArchEthosU55::GetHWOp(op.type);
-
-    if ( IsFloat(op.ifm[0].type | op.ifm[1].type | op.ofm.type) )
-    {
-        return false;
-    }
-
-    if ( npuOp == EthosU55NpuOp::None || npuOp > EthosU55NpuOp::Compound )
-    {
-        return false;
-    }
-
-    auto k = op.kernel;
-    if ( k->Stride().x > 3 || k->Stride().y > 3 )
-    {
-        return false;
-    }
-
-    if ( k->Dilation().x > 2 || k->Dilation().y > 2 )
-    {
-        return false;
-    }
-
-    if ( k->DepthMultiplier() > 1 )
-    {
-        return false;
-    }
-
-    // Validate that input/outputs shapes don't overflow
-    if ( npuOp != EthosU55NpuOp::Dma )
-    {
-        const auto &ifmShape = op.ifm[0].shape;
-        const auto &ofmShape = op.ofm.shape;
-        if ( ifmShape.GreaterMask(MAX_SHAPE) != 0 )
-        {
-            return false;
-        }
-        if ( ofmShape.GreaterMask(MAX_SHAPE) != 0 )
-        {
-            return false;
-        }
-        if ( op.inputs > 1 )
-        {
-            const auto &ifm2Shape = op.ifm[1].shape;
-            if ( ifm2Shape.GreaterMask(MAX_SHAPE) != 0 )
-            {
-                return false;
-            }
-        }
-    }
-
-    // Check allowed ifm/ofm type mapping
-    if ( npuOp != EthosU55NpuOp::Elementwise )
-    {
-        if ( op.type == OpType::LUT || op.type == OpType::MemoryCopy || op.type == OpType::Rescale ||
-             op.type == OpType::Tile || op.type == OpType::Transpose || npuOp == EthosU55NpuOp::Compound )
-        {  // TODO: LUT operations end up here due to UseAvgPoolNop although the rules are not the same as
-           // for a Pooling operation, so skip checks for now.
-            return true;
-        }
-
-        auto map = s_opDataTypeSupport.find(npuOp);
-        if ( map == s_opDataTypeSupport.end() )
-        {
-            assert(false && "Data type mapping for HWOp missing");
-            return false;
-        }
-        auto &typeMap = map->second;
-        auto ifmEntry = typeMap.find(op.ifm[0].type);
-        if ( ifmEntry == typeMap.end() )
-        {  // Unsupported ifm data type
-            return false;
-        }
-        auto &ofmTypes = ifmEntry->second;
-        if ( 0 == std::count(ofmTypes.begin(), ofmTypes.end(), op.ofm.type) )
-        {  // Unsupported ofm data type
-            return false;
-        }
-    }
-    else
-    {
-        std::vector<DataType> validIfmTypes;
-        std::vector<DataType> validOfmTypes;
-        switch ( op.type )
-        {
-            case OpType::Add:
-            case OpType::Sub:
-            case OpType::Mul:
-            {
-                validIfmTypes = {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32};
-                validOfmTypes = validIfmTypes;
-            }
-            break;
-            case OpType::Minimum:
-            case OpType::Maximum:
-            case OpType::LeakyRelu:
-            case OpType::Abs:
-            {
-                validIfmTypes = {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32};
-                validOfmTypes = {op.ifm[0].type};
-            }
-            break;
-            case OpType::CLZ:
-            case OpType::SHL:
-            case OpType::Asr:
-            {
-                validIfmTypes = {DataType::Int32};
-                validOfmTypes = {DataType::Int32};
-                if ( op.type == OpType::Asr )
-                {
-                    validOfmTypes.insert(validOfmTypes.begin(), {DataType::UInt8, DataType::Int8, DataType::Int16});
-                }
-            }
-            break;
-            default:
-                assert(false && "Unkown elementwise type");
-                break;
-        }
-
-        if ( 0 == std::count(validIfmTypes.begin(), validIfmTypes.end(), op.ifm[0].type) )
-        {  // Unsupported ifm data type
-            return false;
-        }
-        if ( IsBinaryElementwise(op.type) && op.ifm[1].type != op.ifm[0].type )
-        {  // ifm2 data type must match ifm data type
-            return false;
-        }
-        if ( 0 == std::count(validOfmTypes.begin(), validOfmTypes.end(), op.ofm.type) )
-        {  // Unsupported ofm data type
-            return false;
-        }
-    }
-
-    return true;
-}
-
 }  // namespace regor
diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55.hpp b/ethosu/regor/architecture/ethosu55/ethos_u55.hpp
index f8e606449b8ed7fe8921b18ca43b86f560f91fe8..5b3de60e9b7873266265778e7ba5725eebe6b244 100644
--- a/ethosu/regor/architecture/ethosu55/ethos_u55.hpp
+++ b/ethosu/regor/architecture/ethosu55/ethos_u55.hpp
@@ -136,9 +136,6 @@ private:
 public:
     int Add(const ArchitectureOpGroupQuery &op, const std::vector<int> &dependsOn = {}) override;
     bool NeedsAllocation(UniqueId TensorUID) override;
-
-protected:
-    bool CanRunOnNPU(const ArchitectureOpGroupQuery &op) override;
 };
 
 /// <summary>
diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_constraints.cpp b/ethosu/regor/architecture/ethosu55/ethos_u55_constraints.cpp
index 7f2101c96d6779ba33a0a160dacd9a8fc107b61f..03fd3d7b918d175f5accf79ca9e611f742f8fe9a 100644
--- a/ethosu/regor/architecture/ethosu55/ethos_u55_constraints.cpp
+++ b/ethosu/regor/architecture/ethosu55/ethos_u55_constraints.cpp
@@ -23,27 +23,42 @@
 namespace regor
 {
 
-// Unsupported operators - must be sorted ascending
-static constexpr OpType s_unsupportedU55[] = {
-    OpType::None,
-    OpType::ArgMax,
-    OpType::Not,
-    OpType::Gather,
-    OpType::Scatter,
-    OpType::Resize,
-    OpType::Cast,
-};
-
-static_assert(is_sorted(s_unsupportedU55), "list must be sorted");
-
-// Short query
-static constexpr std::pair<OpType, QueryResult> s_shortU55[] = {
-    {OpType::Transpose, QueryResult::NativeConstrained},
+// Table of allowed ifm/ofm data type combinations for each HWOp
+static const std::unordered_map<EthosU55NpuOp, std::unordered_map<DataType, std::vector<DataType>>> s_opDataTypeSupport = {
+    {EthosU55NpuOp::Convolution,  // HWOp
+        {
+            // IFM data type  | OFM data type(s)
+            {DataType::UInt8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}},
+            {DataType::Int8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}},
+            {DataType::Int16, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}},
+        }},
+    {EthosU55NpuOp::Depthwise,
+        {
+            {DataType::UInt8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}},
+            {DataType::Int8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}},
+            {DataType::Int16, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}},
+        }},
+    {EthosU55NpuOp::VectorProduct,
+        {
+            {DataType::UInt8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}},
+            {DataType::Int8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}},
+            {DataType::Int16, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}},
+        }},
+    {EthosU55NpuOp::Pooling,
+        {
+            {DataType::UInt8, {DataType::UInt8}},
+            {DataType::Int8, {DataType::Int8}},
+            {DataType::Int16, {DataType::Int16}},
+        }},
+    {EthosU55NpuOp::ReduceSum,
+        {
+            {DataType::UInt8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}},
+            {DataType::Int8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}},
+            {DataType::Int16, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}},
+            {DataType::Int32, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}},
+        }},
 };
 
-static_assert(is_sorted(s_shortU55, [](const auto &a, const auto &b) { return a.first < b.first; }), "list must be sorted");
-
-
 EthosU55Constraints::EthosU55Constraints(ArchEthosU55 *arch) : _arch(arch)
 {
 }
@@ -157,40 +172,192 @@ bool EthosU55Constraints::SupportsRescale(DataType fromType, DataType toType)
     return true;
 }
 
+bool EthosU55Constraints::SupportedDtypes(OpType opType, DataType ifmType, DataType ifm2Type, DataType ofmType)
+{  // Returns true when the ifm/ifm2/ofm data type combination is accepted for opType
+    auto npuOp = _arch->GetHWOp(opType);
+    if ( IsFloat(ifmType | ifm2Type | ofmType) )
+    {  // Float data types are rejected for every operator
+        return false;
+    }
+
+    if ( _arch->UseAvgPoolNop(opType) )
+    {
+        // The rules for UseAvgPoolNop are not the same as for a Pooling operation, so skip checks for now
+        return true;
+    }
+
+    if ( npuOp == EthosU55NpuOp::Compound || npuOp == EthosU55NpuOp::Dma )
+    {  // No data type table is consulted for compound or DMA operations
+        return true;
+    }
+
+    // Non-elementwise HW ops: look up the allowed ifm/ofm type pairing in s_opDataTypeSupport
+    if ( npuOp != EthosU55NpuOp::Elementwise )
+    {
+        auto map = s_opDataTypeSupport.find(npuOp);
+        if ( map == s_opDataTypeSupport.end() )
+        {
+            assert(false && "Data type mapping for HWOp missing");
+            return false;
+        }
+        auto &typeMap = map->second;
+        auto ifmEntry = typeMap.find(ifmType);
+        if ( ifmEntry == typeMap.end() )
+        {  // Unsupported ifm data type
+            return false;
+        }
+        auto &ofmTypes = ifmEntry->second;
+        if ( 0 == std::count(ofmTypes.begin(), ofmTypes.end(), ofmType) )
+        {  // Unsupported ofm data type
+            return false;
+        }
+    }
+    else
+    {  // Elementwise ops: the valid types depend on the specific operator
+        std::vector<DataType> validIfmTypes;
+        std::vector<DataType> validOfmTypes;
+        switch ( opType )
+        {
+            case OpType::Add:
+            case OpType::Sub:
+            case OpType::Mul:
+            {
+                validIfmTypes = {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32};
+                validOfmTypes = validIfmTypes;
+            }
+            break;
+            case OpType::Minimum:
+            case OpType::Maximum:
+            case OpType::LeakyRelu:
+            case OpType::Abs:
+            {
+                validIfmTypes = {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32};
+                validOfmTypes = {ifmType};  // output type must equal the input type
+            }
+            break;
+            case OpType::CLZ:
+            case OpType::SHL:
+            case OpType::Asr:
+            {
+                validIfmTypes = {DataType::Int32};
+                validOfmTypes = {DataType::Int32};
+                if ( opType == OpType::Asr )
+                {  // Asr may additionally produce 8-bit and 16-bit outputs
+                    validOfmTypes.insert(validOfmTypes.begin(), {DataType::UInt8, DataType::Int8, DataType::Int16});
+                }
+            }
+            break;
+            default:  // both lists stay empty, so the checks below reject the op when asserts are compiled out
+                assert(false && "Unknown elementwise type");
+                break;
+        }
+        if ( 0 == std::count(validIfmTypes.begin(), validIfmTypes.end(), ifmType) )
+        {  // Unsupported ifm data type
+            return false;
+        }
+        if ( IsBinaryElementwise(opType) && ifm2Type != ifmType )
+        {  // ifm2 data type must match ifm data type
+            return false;
+        }
+        if ( 0 == std::count(validOfmTypes.begin(), validOfmTypes.end(), ofmType) )
+        {  // Unsupported ofm data type
+            return false;
+        }
+    }
+    return true;
+}
+
 Flags<QueryResult> EthosU55Constraints::OperatorQuery(OpType opType, const ArchOperatorQuery *query, ArchRequirements *req)
 {
-    // Check unsupported operator list before further checks
-    auto posUnsupported = std::equal_range(std::begin(s_unsupportedU55), std::end(s_unsupportedU55), opType);
-    if ( posUnsupported.first != posUnsupported.second )
+    Flags<QueryResult> result = QueryResult::Native;
+    static constexpr int32_t MAX_AXIS = (1 << 16);
+
+    // Check hardware-required substitutions first (short queries pass a null query, so guard the dereference)
+    if ( (opType == OpType::Sigmoid) || (opType == OpType::Tanh) )
+    {
+        if ( query && query->ifm[0].type != DataType::Int16 )
+        {
+            if ( req )
+            {
+                req->req = ArchRequirement::OpSubstitution;
+                req->substitution = OpType::LUT;
+            }
+            result.Set(QueryResult::HasRequirements);
+        }
+    }
+
+    // Check direct native support of the opType
+    auto npuOp = _arch->GetHWOp(opType);
+    if ( npuOp == EthosU55NpuOp::None )
     {
         return QueryResult::Unsupported;
     }
+    else if ( npuOp == EthosU55NpuOp::Dma )
+    {
+        return result;
+    }
 
     // Short query (no additional detail)
     if ( !query )
     {
-        auto posShort = std::equal_range(std::begin(s_shortU55), std::end(s_shortU55),
-            std::pair<OpType, QueryResult>{opType, {}}, [](const auto &a, const auto &b) { return a.first < b.first; });
-        if ( posShort.first != posShort.second )
-        {
-            return posShort.first->second;
-        }
-        return QueryResult::Native;
+        // more detailed query might fail
+        return QueryResult::NativeConstrained;
     }
 
-    // Float types always unsupported
-    if ( (query->ifm[0].shape && IsFloat(query->ifm[0].type)) || (query->ifm[1].shape && IsFloat(query->ifm[1].type)) ||
-         (query->ofm.shape && IsFloat(query->ofm.type)) )
+    const auto &ifmShape = query->ifm[0].shape;
+    const auto &ifm2Shape = query->ifm[1].shape;
+    const auto &ofmShape = query->ofm.shape;
+    bool typeInfo = (query->ifm[0].type != DataType::None && query->ofm.type != DataType::None);
+    bool shapeInfo = (ifmShape && ofmShape);
+
+    if ( !typeInfo || !shapeInfo || !query->kernel )
     {
-        return QueryResult::Unsupported;
+        // missing detail, more detailed queries might fail
+        result.Set(QueryResult::Constrained);
     }
 
-    // Reverse never supported
-    if ( query->reverseMask != ReverseType::None )
+    // Validate DataTypes
+    if ( typeInfo && !SupportedDtypes(opType, query->ifm[0].type, query->ifm[1].type, query->ofm.type) )
     {
         return QueryResult::Unsupported;
     }
 
+    // Validate tensor-shapes
+    if ( shapeInfo )
+    {
+        for ( const auto &s : {ifmShape, ifm2Shape, ofmShape} )
+        {
+            if ( !s ) continue;
+            auto shape = Shape::PadAxes(s, 4, 1);
+            // validate that leading dimensions are unit
+            for ( int i = 0; i < shape.Size() - 3; i++ )
+            {
+                if ( shape[i] > 1 )
+                {
+                    if ( req )
+                    {
+                        req->req.Set(ArchRequirement::Decompose);
+                        req->decomposeProps.Set(ArchProperty::TensorDims);
+                    }
+                    result.Set(QueryResult::HasRequirements);
+                }
+            }
+            // validate that HWC are within valid range
+            for ( int i = shape.Size() - 3; i < shape.Size(); i++ )
+            {
+                if ( shape[i] > MAX_AXIS )
+                {
+                    if ( req )
+                    {
+                        req->req.Set(ArchRequirement::Decompose);
+                        req->decomposeProps.Set(ArchProperty::TensorAxis);
+                    }
+                    result.Set(QueryResult::HasRequirements);
+                }
+            }
+        }
+    }
+
     // Detailed operator queries
     if ( !IsNone(query->transposeMask) )
     {
@@ -205,9 +372,27 @@ Flags<QueryResult> EthosU55Constraints::OperatorQuery(OpType opType, const ArchO
                     req->ifmFormat = TensorFormat::NHWC;
                     req->ofmFormat = TensorFormat::NHWC;
                 }
-                return QueryResult::NativeConstrainedHasReq;
+                result.Set(QueryResult::HasRequirements);
+            }
+            else
+            {
+                // supported with decomposition requirements
+                if ( req )
+                {
+                    req->req.Set(ArchRequirement::Decompose);
+                    req->decomposeProps.Set(ArchProperty::TransposeMask);
+                }
+                result.Set(QueryResult::HasRequirements);
             }
         }
+        else
+        {
+            return QueryResult::Unsupported;
+        }
+    }
+
+    if ( query->reverseMask != ReverseType::None )
+    {
         return QueryResult::Unsupported;
     }
 
@@ -226,22 +411,50 @@ Flags<QueryResult> EthosU55Constraints::OperatorQuery(OpType opType, const ArchO
             req->ifm1Format = TensorFormat::NHWC;  // IFM1 and OFM are depth-sliced
             req->ofmFormat = TensorFormat::NHWC;   // and cannot be addressed if B16
         }
-        return QueryResult::NativeHasReq;
+        result.Set(QueryResult::HasRequirements);
     }
-    else if ( (opType == OpType::Sigmoid) || (opType == OpType::Tanh) )
+
+    // kernel constraint-checks
+    if ( query->kernel )
     {
-        if ( query->ifm[0].type != DataType::Int16 )
+        auto k = query->kernel;
+        if ( k->Stride().x > 3 || k->Stride().y > 3 )
         {
             if ( req )
             {
-                req->req = ArchRequirement::OpSubstitution;
-                req->substitution = OpType::LUT;
+                req->req.Set(ArchRequirement::Decompose);
+                req->decomposeProps.Set(ArchProperty::KernelStride);
             }
-            return QueryResult::NativeHasReq;
+            result.Set(QueryResult::HasRequirements);
+        }
+
+        if ( k->Dilation().x > 2 || k->Dilation().y > 2 )
+        {
+            if ( req )
+            {
+                req->req.Set(ArchRequirement::Decompose);
+                req->decomposeProps.Set(ArchProperty::KernelDilation);
+            }
+            result.Set(QueryResult::HasRequirements);
+        }
+
+        if ( k->DepthMultiplier() > 1 )
+        {
+            if ( req )
+            {
+                req->req.Set(ArchRequirement::Decompose);
+                req->decomposeProps.Set(ArchProperty::DepthMultiplier);
+            }
+            result.Set(QueryResult::HasRequirements);
         }
     }
-    return QueryResult::Native;
-}
+    else
+    {
+        // no kernel provided, more detailed queries might fail
+        result.Set(QueryResult::Constrained);
+    }
 
+    return result;
+}
 
 }  // namespace regor
diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_constraints.hpp b/ethosu/regor/architecture/ethosu55/ethos_u55_constraints.hpp
index a15b3f47181605ebd75466048c2a14a84705b74a..045c18a5ebfacc73b68389e027d5c102d8084ad0 100644
--- a/ethosu/regor/architecture/ethosu55/ethos_u55_constraints.hpp
+++ b/ethosu/regor/architecture/ethosu55/ethos_u55_constraints.hpp
@@ -30,15 +30,18 @@ private:
 public:
     EthosU55Constraints(ArchEthosU55 *arch);
 
-    bool SupportsFusedReverse(OpType opType, ReverseType reverseTypeMask) override;
     bool SupportsFusedRescale(OpType opType, TensorUsage tensorUsage, DataType rescaleFromType, DataType rescaleToType,
         DataType opFromType, DataType opToType, const Quantization &quantization) override;
-    TransposeSupport SupportsFusedTranspose(OpType opType, TransposeType transposeType) override;
     bool SupportsAccumulatorSaveRestore() override { return false; }
     bool SupportsNegativeStrides() override { return true; };
     bool SupportsElementwiseLeakyRelu(bool quantized, DataType type) override;
     bool SupportsRescale(DataType fromType, DataType toType) override;
     Flags<QueryResult> OperatorQuery(OpType opType, const ArchOperatorQuery *query, ArchRequirements *req) override;
+
+protected:
+    bool SupportedDtypes(OpType opType, DataType ifmType, DataType ifm2Type, DataType ofmType) override;
+    bool SupportsFusedReverse(OpType opType, ReverseType reverseTypeMask) override;
+    TransposeSupport SupportsFusedTranspose(OpType opType, TransposeType transposeType) override;
 };
 
 }  // namespace regor
diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85.cpp b/ethosu/regor/architecture/ethosu85/ethos_u85.cpp
index 5ff354547bd5d3da4695882caf737ffdfd1cae8b..3312c0ba8a491a1e4ece1bec26d89f355251a806 100644
--- a/ethosu/regor/architecture/ethosu85/ethos_u85.cpp
+++ b/ethosu/regor/architecture/ethosu85/ethos_u85.cpp
@@ -83,8 +83,6 @@ static const ArchEthosU85::AcceleratorConfig s_EthosU85Configs[] = {
 
 constexpr int CB_SLOTS = 6;
 constexpr int BRICK_ELEMENTS = 16;
-// max size for tensors
-const static Shape MAX_SHAPE(nullptr, 8, 65536);
 constexpr int ACC_DEPTH_GRANULE = 16;  // Accumulator depth granularity
 
 enum class ElementwiseUsage
@@ -243,7 +241,8 @@ Flags<WeightFormat> ArchEthosU85::SupportedWeightFormat(OpType op)
 
 bool ArchEthosU85::UseAvgPoolNop(OpType type)
 {
-    return IsActivation(type) || type == OpType::Quantize || type == OpType::MemoryCopy || type == OpType::Transpose || type == OpType::Reverse;
+    return IsActivation(type) || type == OpType::Quantize || type == OpType::MemoryCopy || type == OpType::Transpose ||
+           type == OpType::Reverse || type == OpType::Rescale;
 }
 
 bool ArchEthosU85::UseNullPool(OpType opType, int bits)
@@ -1339,12 +1338,10 @@ EthosU85NpuOp ArchEthosU85::GetHWOp(OpType type)
         {OpType::ReduceMax, EthosU85NpuOp::ReduceMinMax},
         {OpType::ReduceAny, EthosU85NpuOp::ReduceMinMax},
         {OpType::ReduceAll, EthosU85NpuOp::ReduceMinMax},
-        // TODO MLBEDSW-7986 add none pooling
         {OpType::Resize, EthosU85NpuOp::Resize},
         {OpType::Gather, EthosU85NpuOp::Dma},
         {OpType::Scatter, EthosU85NpuOp::Dma},
         {OpType::Tile, EthosU85NpuOp::Dma},
-        {OpType::Rescale, EthosU85NpuOp::Pooling},
     };
 
     auto pos = toNpuOp.find(type);
@@ -1580,12 +1577,6 @@ int EthosU85OpGroup::Add(const ArchitectureOpGroupQuery &op, const std::vector<i
 {
     int externalInputs = ExternalIfms(op);
 
-    if ( !CanRunOnNPU(op) )
-    {
-        // Can only fuse NPU ops
-        return 0;
-    }
-
     if ( _opsCount == 0 )
     {
         _supportsChaining = CanStartChain(op);
@@ -1646,185 +1637,4 @@ bool EthosU85OpGroup::NeedsAllocation(UniqueId tensorUID)
     return !IsChained(tensorUID) && !IsFused(tensorUID);
 }
 
-// TODO: This table is from the EthosU55/U65 Embedded NPU Interface Specification, it's not completely valid for
-// Ethos U85 since the allowed data types depend on ifm/ofm as well as selected acc and scaling.
-static const std::unordered_map<EthosU85NpuOp, std::unordered_map<DataType, std::vector<DataType>>> s_opDataTypeSupport = {
-    {EthosU85NpuOp::Convolution,
-        {
-            {DataType::UInt8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32, DataType::Int64}},
-            {DataType::Int8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32, DataType::Int64}},
-            {DataType::Int16, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32, DataType::Int64}},
-        }},
-    {EthosU85NpuOp::Depthwise,
-        {
-            {DataType::UInt8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32, DataType::Int64}},
-            {DataType::Int8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32, DataType::Int64}},
-            {DataType::Int16, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32, DataType::Int64}},
-        }},
-    {EthosU85NpuOp::VectorProduct,
-        {
-            {DataType::UInt8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32, DataType::Int64}},
-            {DataType::Int8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32, DataType::Int64}},
-            {DataType::Int16, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32, DataType::Int64}},
-        }},
-    {EthosU85NpuOp::Pooling,
-        {
-            {DataType::Bool8, {DataType::Bool8, DataType::Int32, DataType::Int64}},
-            {DataType::UInt8, {DataType::UInt8, DataType::Int32, DataType::Int64}},
-            {DataType::Int8, {DataType::Int8, DataType::Int32, DataType::Int64}},
-            {DataType::Int16, {DataType::Int16}},
-        }},
-    {EthosU85NpuOp::ReduceMinMax,
-        {
-            {DataType::Bool8, {DataType::Bool8, DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}},
-            {DataType::UInt8, {DataType::Bool8, DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}},
-            {DataType::Int8, {DataType::Bool8, DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}},
-            {DataType::Int16, {DataType::Bool8, DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}},
-            {DataType::Int32, {DataType::Bool8, DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}},
-        }},
-    {EthosU85NpuOp::ReduceSum,
-        {
-            {DataType::UInt8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}},
-            {DataType::Int8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}},
-            {DataType::Int16, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}},
-            {DataType::Int32, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}},
-        }},
-    {EthosU85NpuOp::ArgMax,
-        {
-            {DataType::Bool8, {DataType::Int32, DataType::Int64}},
-            {DataType::UInt8, {DataType::Int32, DataType::Int64}},
-            {DataType::Int8, {DataType::Int32, DataType::Int64}},
-            {DataType::Int16, {DataType::Int32, DataType::Int64}},
-        }},
-    {EthosU85NpuOp::Dma,
-        {
-            {DataType::Bool8, {DataType::Bool8}},
-            {DataType::UInt8, {DataType::UInt8}},
-            {DataType::Int8, {DataType::Int8}},
-            {DataType::Int16, {DataType::Int16}},
-            {DataType::Int32, {DataType::Int32}},
-        }},
-    {EthosU85NpuOp::Resize,
-        {
-            {DataType::UInt8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32, DataType::Int64}},
-            {DataType::Int8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32, DataType::Int64}},
-            {DataType::Int16, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32, DataType::Int64}},
-        }},
-};
-
-bool EthosU85OpGroup::CanRunOnNPU(const ArchitectureOpGroupQuery &op)
-{
-    EthosU85NpuOp npuOp = ArchEthosU85::GetHWOp(op.type);
-
-    if ( IsFloat(op.ifm[0].type | op.ifm[1].type | op.ofm.type) )
-    {
-        return false;
-    }
-
-    if ( npuOp == EthosU85NpuOp::None )
-    {
-        return false;
-    }
-
-    auto k = op.kernel;
-    if ( k->Stride().x > 3 || k->Stride().y > 3 )
-    {
-        return false;
-    }
-
-    if ( k->Dilation().x > 2 || k->Dilation().y > 2 )
-    {
-        return false;
-    }
-
-    if ( k->DepthMultiplier() > 1 )
-    {
-        return false;
-    }
-
-    switch ( npuOp )
-    {
-        case EthosU85NpuOp::Convolution:
-        case EthosU85NpuOp::Depthwise:
-        case EthosU85NpuOp::VectorProduct:
-        case EthosU85NpuOp::Pooling:
-        case EthosU85NpuOp::ReduceMinMax:
-        case EthosU85NpuOp::ReduceSum:
-        case EthosU85NpuOp::ArgMax:
-        case EthosU85NpuOp::Elementwise:
-        case EthosU85NpuOp::Resize:
-        case EthosU85NpuOp::Dma:
-            break;
-        default:
-            assert(false && "Unrecognized HWOp");
-            return false;
-    }
-
-    // Validate that input/outputs shapes don't overflow
-    if ( npuOp != EthosU85NpuOp::Dma )
-    {
-        const auto &ifmShape = op.ifm[0].shape;
-        const auto &ofmShape = op.ofm.shape;
-
-        if ( ifmShape.GreaterMask(MAX_SHAPE) != 0 )
-        {
-            return false;
-        }
-        if ( ofmShape.GreaterMask(MAX_SHAPE) != 0 )
-        {
-            return false;
-        }
-        if ( op.inputs > 1 )
-        {
-            const auto &ifm2Shape = op.ifm[1].shape;
-            if ( ifm2Shape.GreaterMask(MAX_SHAPE) != 0 )
-            {
-                return false;
-            }
-        }
-    }
-
-    // Check allowed ifm/ofm data type mapping
-    if ( npuOp != EthosU85NpuOp::Elementwise )
-    {
-        if ( op.type == OpType::LUT || op.type == OpType::MemoryCopy || op.type == OpType::Rescale || op.type == OpType::Tile )
-        {  // TODO: LUT operations end up here due to UseAvgPoolNop although the rules are not the same as
-           // for a Pooling operation, so skip checks for now.
-            return true;
-        }
-
-        if ( op.type == OpType::Transpose || op.type == OpType::Reverse )
-        {
-            ArchOperatorQuery query;
-            query.transposeMask = op.ofm.transpose;
-            query.reverseMask = op.ofm.reverse;
-            return _arch->_constraints->OperatorQuery(OpType::MemoryCopy, &query, nullptr).Any(QueryResult::Native);
-        }
-
-        auto map = s_opDataTypeSupport.find(npuOp);
-        if ( map == s_opDataTypeSupport.end() )
-        {
-            assert(false && "Data type mapping for HWOp missing");
-            return false;
-        }
-        auto &typeMap = map->second;
-        auto ifmEntry = typeMap.find(op.ifm[0].type);
-        if ( ifmEntry == typeMap.end() )
-        {  // Unsupported ifm data type
-            return false;
-        }
-        auto &ofmTypes = ifmEntry->second;
-        if ( 0 == std::count(ofmTypes.begin(), ofmTypes.end(), op.ofm.type) )
-        {  // Unsupported ofm data type
-            return false;
-        }
-    }
-    else
-    {
-        // TODO: Elementwise
-    }
-
-    return true;
-}
-
 }  // namespace regor
diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85.hpp b/ethosu/regor/architecture/ethosu85/ethos_u85.hpp
index c4cbbad2d695631ae7d04b0c3c32468a158b3301..d23e64f74ab020a9794fb5a580a336c9ed94195e 100644
--- a/ethosu/regor/architecture/ethosu85/ethos_u85.hpp
+++ b/ethosu/regor/architecture/ethosu85/ethos_u85.hpp
@@ -136,7 +136,6 @@ public:
     bool NeedsAllocation(UniqueId tensorUID) override;
 
 protected:
-    bool CanRunOnNPU(const ArchitectureOpGroupQuery &op) override;
     int ChainingBuffer(UniqueId tensorUID);
     bool IsChained(UniqueId tensorUID);
     bool IsFused(UniqueId tensorUID);
diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85_constraints.cpp b/ethosu/regor/architecture/ethosu85/ethos_u85_constraints.cpp
index 535ede5ca4e0390d4d75603a34a3fe4b125ff5cb..8a2ea87c076fe4331afe72f01a451dc8db5aad85 100644
--- a/ethosu/regor/architecture/ethosu85/ethos_u85_constraints.cpp
+++ b/ethosu/regor/architecture/ethosu85/ethos_u85_constraints.cpp
@@ -21,22 +21,69 @@
 #include "ethos_u85.hpp"
 #include "ethos_u85_register_cs_generator.hpp"
 
+#include <unordered_map>
+
 namespace regor
 {
 
-// Unsupported operators - must be sorted ascending
-static constexpr OpType s_unsupportedU85[] = {OpType::None};
-
-static_assert(is_sorted(s_unsupportedU85), "list must be sorted");
-
-
-// Short query
-static constexpr std::pair<OpType, QueryResult> s_shortU85[] = {
-    {OpType::Transpose, QueryResult::Native},
+// TODO: This table is from the EthosU55/U65 Embedded NPU Interface Specification, it's not completely valid for
+// Ethos U85 since the allowed data types depend on ifm/ofm as well as selected acc and scaling.
+static const std::unordered_map<EthosU85NpuOp, std::unordered_map<DataType, std::vector<DataType>>> s_opDataTypeSupport = {
+    {EthosU85NpuOp::Convolution,
+        {
+            {DataType::UInt8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32, DataType::Int64}},
+            {DataType::Int8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32, DataType::Int64}},
+            {DataType::Int16, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32, DataType::Int64}},
+        }},
+    {EthosU85NpuOp::Depthwise,
+        {
+            {DataType::UInt8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32, DataType::Int64}},
+            {DataType::Int8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32, DataType::Int64}},
+            {DataType::Int16, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32, DataType::Int64}},
+        }},
+    {EthosU85NpuOp::VectorProduct,
+        {
+            {DataType::UInt8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32, DataType::Int64}},
+            {DataType::Int8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32, DataType::Int64}},
+            {DataType::Int16, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32, DataType::Int64}},
+        }},
+    {EthosU85NpuOp::Pooling,
+        {
+            {DataType::Bool8, {DataType::Bool8, DataType::Int32, DataType::Int64}},
+            {DataType::UInt8, {DataType::UInt8, DataType::Int32, DataType::Int64}},
+            {DataType::Int8, {DataType::Int8, DataType::Int32, DataType::Int64}},
+            {DataType::Int16, {DataType::Int16}},
+        }},
+    {EthosU85NpuOp::ReduceMinMax,
+        {
+            {DataType::Bool8, {DataType::Bool8, DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}},
+            {DataType::UInt8, {DataType::Bool8, DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}},
+            {DataType::Int8, {DataType::Bool8, DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}},
+            {DataType::Int16, {DataType::Bool8, DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}},
+            {DataType::Int32, {DataType::Bool8, DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}},
+        }},
+    {EthosU85NpuOp::ReduceSum,
+        {
+            {DataType::UInt8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}},
+            {DataType::Int8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}},
+            {DataType::Int16, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}},
+            {DataType::Int32, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}},
+        }},
+    {EthosU85NpuOp::ArgMax,
+        {
+            {DataType::Bool8, {DataType::Int32, DataType::Int64}},
+            {DataType::UInt8, {DataType::Int32, DataType::Int64}},
+            {DataType::Int8, {DataType::Int32, DataType::Int64}},
+            {DataType::Int16, {DataType::Int32, DataType::Int64}},
+        }},
+    {EthosU85NpuOp::Resize,
+        {
+            {DataType::UInt8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32, DataType::Int64}},
+            {DataType::Int8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32, DataType::Int64}},
+            {DataType::Int16, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32, DataType::Int64}},
+        }},
 };
 
-static_assert(is_sorted(s_shortU85, [](const auto &a, const auto &b) { return a.first < b.first; }), "list must be sorted");
-
 TransposeSupport EthosU85Constraints::SupportsFusedTranspose(OpType opType, TransposeType transposeType)
 {
     if ( transposeType == TransposeType::None ) return TransposeSupport::Any;
@@ -162,11 +209,68 @@ bool EthosU85Constraints::SupportsRescale(DataType fromType, DataType toType)
     return fromType != DataType::UInt16;
 }
 
+bool EthosU85Constraints::SupportedDtypes(OpType opType, DataType ifmType, DataType ifm2Type, DataType ofmType)
+{
+    auto npuOp = _arch->GetHWOp(opType);
+    if ( IsFloat(ifmType | ifm2Type | ofmType) )
+    {
+        return false;
+    }
+
+    if ( _arch->UseAvgPoolNop(opType) )
+    {
+        // The rules for UseAvgPoolNop are not the same as for a Pooling operation, skip checks for now
+        return true;
+    }
+
+    if ( npuOp != EthosU85NpuOp::Elementwise )
+    {
+        auto map = s_opDataTypeSupport.find(npuOp);
+        if ( map == s_opDataTypeSupport.end() )
+        {
+            assert(false && "Data type mapping for HWOp missing");
+            return false;
+        }
+        auto &typeMap = map->second;
+        auto ifmEntry = typeMap.find(ifmType);
+        if ( ifmEntry == typeMap.end() )
+        {
+            // Unsupported ifm data type
+            return false;
+        }
+        auto &ofmTypes = ifmEntry->second;
+        if ( 0 == std::count(ofmTypes.begin(), ofmTypes.end(), ofmType) )
+        {
+            // Unsupported ofm data type
+            return false;
+        }
+    }
+    else
+    {
+        // TODO elementwise
+    }
+    return true;
+}
+
 Flags<QueryResult> EthosU85Constraints::OperatorQuery(OpType opType, const ArchOperatorQuery *query, ArchRequirements *req)
 {
-    // Check unsupported operator list first
-    auto posUnsupported = std::equal_range(std::begin(s_unsupportedU85), std::end(s_unsupportedU85), opType);
-    if ( posUnsupported.first != std::end(s_unsupportedU85) )
+    Flags<QueryResult> result = QueryResult::Native;
+    static constexpr int32_t MAX_AXIS = (1 << 16);
+
+    // Check hardware-required substitutions first
+    if ( (opType == OpType::Sigmoid) || (opType == OpType::Tanh) )
+    {
+        if ( req )
+        {
+            req->req = ArchRequirement::OpSubstitution;
+            req->substitution = OpType::LUT;
+        }
+        result.Set(QueryResult::HasRequirements);
+    }
+
+    // Check direct native support of the opType
+    auto npuOp = _arch->GetHWOp(opType);
+    if ( npuOp == EthosU85NpuOp::None )
     {
         return QueryResult::Unsupported;
     }
@@ -174,52 +278,151 @@ Flags<QueryResult> EthosU85Constraints::OperatorQuery(OpType opType, const ArchO
     // Short query (no additional detail)
     if ( !query )
     {
-        auto posShort = std::equal_range(std::begin(s_shortU85), std::end(s_shortU85),
-            std::pair<OpType, QueryResult>{opType, {}}, [](const auto &a, const auto &b) { return a.first < b.first; });
-        if ( posShort.first != std::end(s_shortU85) )
+        // Op type maps to a HW op, but without query details a more detailed query might still fail
+        return QueryResult::NativeConstrained;
+    }
+
+    // Fusing checks
+    if ( query->transposeMask != TransposeType::None )
+    {
+        TransposeSupport tmp = SupportsFusedTranspose(opType, query->transposeMask);
+        if ( tmp == TransposeSupport::None )
+        {
+            if ( opType == OpType::Transpose )
+            {
+                // Unsupported mask for a standalone transpose: decomposable, so flag it even when req is null
+                if ( req )
+                {
+                    req->req.Set(ArchRequirement::Decompose);
+                    req->decomposeProps.Set(ArchProperty::TransposeMask);
+                }
+                result.Set(QueryResult::HasRequirements);
+            }
+            else
+            {
+                // A transpose mask that cannot be fused into a non-transpose op is unsupported
+                return QueryResult::Unsupported;
+            }
+        }
+    }
+    if ( query->reverseMask != ReverseType::None )
+    {
+        if ( !SupportsFusedReverse(opType, query->reverseMask) )
         {
-            return posShort.first->second;
+            return QueryResult::Unsupported;
         }
-        return QueryResult::Native;
     }
 
-    // Float types always unsupported
-    if ( (query->ifm[0].shape && IsFloat(query->ifm[0].type)) || (query->ifm[1].shape && IsFloat(query->ifm[1].type)) ||
-         (query->ofm.shape && IsFloat(query->ofm.type)) )
+    if ( npuOp == EthosU85NpuOp::Dma )
     {
-        return QueryResult::Unsupported;
+        return result;
     }
 
-    if ( query->transposeMask != TransposeType::None )
+    const auto &ifmShape = query->ifm[0].shape;
+    const auto &ifm2Shape = query->ifm[1].shape;
+    const auto &ofmShape = query->ofm.shape;
+    bool typeInfo = (query->ifm[0].type != DataType::None && query->ofm.type != DataType::None);
+    bool shapeInfo = (ifmShape && ofmShape);
+
+    if ( !typeInfo || !shapeInfo || !query->kernel )
     {
-        TransposeSupport tmp = SupportsFusedTranspose(opType, query->transposeMask);
-        if ( tmp == TransposeSupport::None ) return QueryResult::Unsupported;
+        // missing detail, more detailed queries might fail
+        result.Set(QueryResult::Constrained);
     }
 
-    if ( query->reverseMask != ReverseType::None )
+    // Validate dataTypes
+    if ( typeInfo && !SupportedDtypes(opType, query->ifm[0].type, query->ifm[1].type, query->ofm.type) )
     {
-        if ( !SupportsFusedReverse(opType, query->reverseMask) ) return QueryResult::Unsupported;
+        return QueryResult::Unsupported;
     }
 
-    // Operator specific
-    if ( (opType == OpType::Sigmoid) || (opType == OpType::Tanh) )
+    // Validate tensor-shapes
+    if ( shapeInfo )
     {
-        if ( req )
+        for ( const auto &s : {ifmShape, ifm2Shape, ofmShape} )
         {
-            req->req = ArchRequirement::OpSubstitution;
-            req->substitution = OpType::LUT;
+            if ( !s ) continue;
+            auto shape = Shape::PadAxes(s, 4, 1);
+            // validate that leading dimensions are unit
+            for ( int i = 0; i < shape.Size() - 3; i++ )
+            {
+                if ( shape[i] > 1 )
+                {
+                    if ( req )
+                    {
+                        req->req.Set(ArchRequirement::Decompose);
+                        req->decomposeProps.Set(ArchProperty::TensorDims);
+                    }
+                    result.Set(QueryResult::HasRequirements);
+                }
+            }
+            // validate that HWC are within valid range
+            for ( int i = shape.Size() - 3; i < shape.Size(); i++ )
+            {
+                if ( shape[i] > MAX_AXIS )
+                {
+                    if ( req )
+                    {
+                        req->req.Set(ArchRequirement::Decompose);
+                        req->decomposeProps.Set(ArchProperty::TensorAxis);
+                    }
+                    result.Set(QueryResult::HasRequirements);
+                }
+            }
+        }
+    }
+
+    // Detailed operator queries
+    if ( opType == OpType::MatMul )
+    {
+        // Constrain Matmul height to 1
+        if ( ofmShape.Size() > 2 && ofmShape.Height() > 1 )
+        {
+            if ( req )
+            {
+                req->req.Set(ArchRequirement::Decompose);
+                req->decomposeProps.Set(ArchProperty::TensorAxis);
+            }
+            result.Set(QueryResult::HasRequirements);
         }
-        return QueryResult::NativeHasReq;
     }
-    else if ( opType == OpType::MatMul )
+
+    // kernel constraint-checks
+    if ( query->kernel )
     {
-        if ( (query->ofm.shape.Size() >= 2) && query->ofm.shape.Elements() > query->ofm.shape.ElementsWC() )
+        auto k = query->kernel;
+        if ( k->Stride().x > 3 || k->Stride().y > 3 )
         {
-            return QueryResult::NativeDecompose;
+            if ( req )
+            {
+                req->req.Set(ArchRequirement::Decompose);
+                req->decomposeProps.Set(ArchProperty::KernelStride);
+            }
+            result.Set(QueryResult::HasRequirements);
+        }
+
+        if ( k->Dilation().x > 2 || k->Dilation().y > 2 )
+        {
+            if ( req )
+            {
+                req->req.Set(ArchRequirement::Decompose);
+                req->decomposeProps.Set(ArchProperty::KernelDilation);
+            }
+            result.Set(QueryResult::HasRequirements);
+        }
+
+        if ( k->DepthMultiplier() > 1 )
+        {
+            if ( req )
+            {
+                req->req.Set(ArchRequirement::Decompose);
+                req->decomposeProps.Set(ArchProperty::DepthMultiplier);
+            }
+            result.Set(QueryResult::HasRequirements);
         }
     }
 
-    return QueryResult::Native;
+    return result;
 }
 
 }  // namespace regor
diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85_constraints.hpp b/ethosu/regor/architecture/ethosu85/ethos_u85_constraints.hpp
index c193d6d3f583d815ec8bc0e999ee0c5a2f0757a6..b682369832c2ea2f1322f5da63dbe6db9f6c4725 100644
--- a/ethosu/regor/architecture/ethosu85/ethos_u85_constraints.hpp
+++ b/ethosu/regor/architecture/ethosu85/ethos_u85_constraints.hpp
@@ -38,6 +38,9 @@ public:
     bool SupportsElementwiseLeakyRelu(bool quantized, DataType type) override { return true; };
     bool SupportsRescale(DataType fromType, DataType toType) override;
     Flags<QueryResult> OperatorQuery(OpType opType, const ArchOperatorQuery *query, ArchRequirements *req) override;
+
+protected:
+    bool SupportedDtypes(OpType opType, DataType ifmType, DataType ifm2Type, DataType ofmType) override;
 };
 
 }  // namespace regor
diff --git a/ethosu/regor/compiler/graphir_optimiser.cpp b/ethosu/regor/compiler/graphir_optimiser.cpp
index 7d0c68d8f133b06ee76e80862deb0618c416d934..962e86aa8a46bc9889888c8d15c40148b3f8ba3a 100644
--- a/ethosu/regor/compiler/graphir_optimiser.cpp
+++ b/ethosu/regor/compiler/graphir_optimiser.cpp
@@ -1627,6 +1627,7 @@ Operation *GraphIrOptimiser::MergeTransposes(Graph *const graph, Operation *cons
 
             // Can't merge if both apply quantization
             bool prevHasQuant = prevConn->quantization.IsValid() && !prevConn->quantization.IsUnitScale();
+
             if ( opHasQuant && prevHasQuant ) return returnOp;
 
             // Examine previous op's transpose
@@ -1643,16 +1644,22 @@ Operation *GraphIrOptimiser::MergeTransposes(Graph *const graph, Operation *cons
             TransposeType mergedTranspose = TransposeTypeFromShape(finalMapping);
 
             ArchOperatorQuery query;
+            ArchRequirements req;
             query.transposeMask = mergedTranspose;
-            if ( _constraints->OperatorQuery(OpType::Transpose, &query, nullptr).Any(QueryResult::Native) )
+            if ( _constraints->OperatorQuery(OpType::Transpose, &query, &req).Any(QueryResult::Native) )
             {
-                // Change the transpose attribute on the preceding transpose and remove this one
-                prevAttr->perm = finalMapping;
-                TensorConnection &newConn = prevOp->ConnectOutput(TensorUsage::OFM, ofm);
-                newConn.Set(ofmConn->slice).Set(ofmConn->reverse).Set(ofmConn->shape);
-                if ( !prevHasQuant && opHasQuant ) newConn.Set(ofmConn->quantization);
-                operation->Disconnect();
-                return prevOp;
+                // Only merge the transposes if the merged mask is natively supported,
+                // i.e. it does not require transpose-mask decomposition
+                if ( !req.decomposeProps.Any(ArchProperty::TransposeMask) )
+                {
+                    // Change the transpose attribute on the preceding transpose and remove this one
+                    prevAttr->perm = finalMapping;
+                    TensorConnection &newConn = prevOp->ConnectOutput(TensorUsage::OFM, ofm);
+                    newConn.Set(ofmConn->slice).Set(ofmConn->reverse).Set(ofmConn->shape);
+                    if ( !prevHasQuant && opHasQuant ) newConn.Set(ofmConn->quantization);
+                    operation->Disconnect();
+                    return prevOp;
+                }
             }
         }
     }
diff --git a/ethosu/regor/compiler/scheduler_decompose.cpp b/ethosu/regor/compiler/scheduler_decompose.cpp
index bd02b2417ab76c00e73fec5068b119e376cd7e52..836f590deaed73b5763abf8a46037a202b662353 100644
--- a/ethosu/regor/compiler/scheduler_decompose.cpp
+++ b/ethosu/regor/compiler/scheduler_decompose.cpp
@@ -30,9 +30,22 @@ namespace regor
 
 static constexpr int MAX_DIM = 65536;
 
-bool NeedsDecompose(Architecture *arch, const SchedulerOperation *schedOp)
+Flags<QueryResult> OperatorQuery(Architecture *arch, const SchedulerOperation *schedOp, ArchRequirements *req)
+{
+    ArchOperatorQuery query{};
+    const SchedulerConnection *ofmConn = schedOp->OFM();
+    Set(query.ifm[0], schedOp->IFM(0));
+    Set(query.ifm[1], schedOp->TryIFM(1));
+    Set(query.ofm, ofmConn);
+    query.transposeMask = ofmConn->transpose;
+    query.reverseMask = ofmConn->reverse;
+    query.kernel = schedOp->Kernel();
+    return arch->Constraints()->OperatorQuery(schedOp->Type(), &query, req);
+}
+
+bool ShouldDecompose(Architecture *arch, const SchedulerOperation *schedOp)
 {
-    return CanDecompose(arch, schedOp) && !CanRunOnHardware(arch, schedOp);
+    return CanDecompose(arch, schedOp) && NeedsDecompose(arch, schedOp);
 }
 
 static std::unique_ptr<SchedulerOperation> MakeMemCopy(const std::shared_ptr<SchedulerTensor> &source,
@@ -193,70 +206,27 @@ static std::unique_ptr<ArchitectureOpConfig> GetOpConfig(Architecture *arch, con
     return arch->GetOpConfig(schedOp->Type(), qConfig);
 }
 
-bool CanRunOnHardware(Architecture *arch, const SchedulerOperation *schedOp)
+bool NeedsDecompose(Architecture *arch, const SchedulerOperation *schedOp)
 {
-    regor::ArchitectureOpGroupQuery qOpGroup{};
-    if ( DecomposeAsElementwise(schedOp->Type()) || schedOp->Type() == OpType::MemoryCopy )
+    ArchRequirements req{};
+    Flags<QueryResult> result = OperatorQuery(arch, schedOp, &req);
+    // Assert complete query
+    assert(result.Any(QueryResult::Constrained) == false && "Constrained result from complete OperatorQuery");
+    if ( result.Any(QueryResult::Unsupported) )
     {
-        auto &ofmShape = schedOp->OFM()->SliceShape();
-        if ( ofmShape.Size() > 3 && ofmShape.Elements() > ofmShape.Width() * ofmShape.Height() * ofmShape.Depth() )
-            return false;
+        // Operations completely unsupported by HW should not be decomposed
+        return false;
     }
-    if ( schedOp->Type() == OpType::MatMul )
+    if ( result.Any(QueryResult::HasRequirements) )
     {
-        const auto ofmConn = schedOp->OFM();
-        ArchOperatorQuery query;
-        Set(query.ifm[0], schedOp->IFM(0));
-        Set(query.ifm[1], schedOp->IFM(1));
-        Set(query.ofm, ofmConn);
-        query.transposeMask = ofmConn->transpose;
-        if ( (arch->Constraints()->OperatorQuery(OpType::MatMul, &query, nullptr) & QueryResult::NativeDecompose) != QueryResult::Native )
+        if ( req.req.Any(ArchRequirement::Decompose) )
         {
-            return false;
+            return true;
         }
+        // Has requirements but not decomposition-related
     }
-    if ( IsConvolution(schedOp->Type()) || IsPooling(schedOp->Type()) )
-    {
-        auto &ofmShape = schedOp->OFM()->SliceShape();
-        if ( ofmShape.Size() > 3 && ofmShape.Batch() > 1 ) return false;
-    }
-    if ( schedOp->Type() == OpType::Transpose )
-    {
-        auto &ifmShape = schedOp->IFM(0)->SliceShape();
-        if ( ifmShape.Size() > 3 && ifmShape.Elements() > ifmShape.Width() * ifmShape.Height() * ifmShape.Depth() )
-            return false;
-        auto &ofmShape = schedOp->OFM()->SliceShape();
-        if ( ofmShape.Size() > 3 && ofmShape.Elements() > ofmShape.Width() * ofmShape.Height() * ofmShape.Depth() )
-            return false;
-
-        ArchOperatorQuery query;
-        query.transposeMask = schedOp->OFM()->transpose;
-        if ( !arch->Constraints()->OperatorQuery(OpType::Transpose, &query, nullptr).Any(QueryResult::Native) )
-        {
-            return false;
-        }
-    }
-    auto *ifm = schedOp->TryIFM(0);
-    auto *ifm2 = schedOp->TryIFM(1);
-    auto *ofm = schedOp->TryOFM();
-    if ( !ifm || !ofm ) return false;
-    qOpGroup.type = schedOp->Type();
-    qOpGroup.kernel = schedOp->Kernel();
-    qOpGroup.ifm[0].key = ifm->tensor->uid;
-    qOpGroup.ifm[0].type = ifm->tensor->dataType;
-    qOpGroup.ifm[0].shape = ifm->SliceShape();
-    if ( ifm2 )
-    {
-        qOpGroup.ifm[1].key = ifm2->tensor->uid;
-        qOpGroup.ifm[1].type = ifm2->tensor->dataType;
-        qOpGroup.ifm[1].shape = ifm2->SliceShape();
-    }
-    qOpGroup.ofm.key = ofm->tensor->uid;
-    qOpGroup.ofm.type = ofm->tensor->dataType;
-    qOpGroup.ofm.shape = ofm->SliceShape();
-    qOpGroup.ofm.transpose = ofm->transpose;
-    if ( arch->CreateOpGroup(qOpGroup) == nullptr ) return false;
-    return GetOpConfig(arch, schedOp) != nullptr;
+    // If no op config can be found, the operation requires decomposition
+    return !GetOpConfig(arch, schedOp);
 }
 
 bool CanDecompose(Architecture *, const SchedulerOperation *schedOp)
@@ -810,7 +780,7 @@ std::vector<std::unique_ptr<SchedulerOperation>> DecomposeConv2D(Architecture *a
     {
         return DecomposeLeadingDimensions(1, arch, std::move(op), DecomposeConv2D);
     }
-    if ( CanRunOnHardware(arch, op.get()) )
+    if ( !NeedsDecompose(arch, op.get()) )
     {
         UpdatePaddingAndIfmOffset(op.get());
         result.emplace_back(std::move(op));
@@ -1128,7 +1098,7 @@ std::vector<std::unique_ptr<SchedulerOperation>> DecomposeDepthwiseConv2D(Archit
         return result;
     }
 
-    if ( CanRunOnHardware(arch, op.get()) )
+    if ( !NeedsDecompose(arch, op.get()) )
     {
         UpdatePaddingAndIfmOffset(op.get());
         result.emplace_back(std::move(op));
@@ -1621,12 +1591,106 @@ std::vector<std::unique_ptr<SchedulerOperation>> DecomposeTranspose(Architecture
     const auto &ifmShape = ifmConn->SliceShape();
     const auto axes = ifmShape.Size();
 
-    ArchOperatorQuery query;
-    query.transposeMask = ofmConn->transpose;
-    bool supported = arch->Constraints()->OperatorQuery(OpType::Transpose, &query, nullptr).Any(QueryResult::Native);
+    auto req = ArchRequirements();
+    auto qResult = OperatorQuery(arch, op.get(), &req);
+    bool decomposeMask = false;
+    bool decomposeAxes = false;
+    bool decomposeLeadingDims = false;
+
+    if ( qResult.Any(QueryResult::HasRequirements) && req.req.Any(ArchRequirement::Decompose) )
+    {
+        decomposeMask = req.decomposeProps.Any(ArchProperty::TransposeMask);
+        decomposeAxes = req.decomposeProps.Any(ArchProperty::TensorAxis);
+        decomposeLeadingDims = req.decomposeProps.Any(ArchProperty::TensorDims);
+    }
+
+    if ( decomposeMask || decomposeLeadingDims )
+    {
+        // Decompose unsupported transpose-masks or large IFM-dimensions
+        // by unrolling the transpose-mask into many 3D-transpose operations.
+
+        // We can handle TransposeType::None as an elementwise, because it's basically a memory copy
+        if ( ofmConn->transpose == TransposeType::None )
+        {
+            LOG_TRACE1("DecomposeTranspose: Decomposing as elementwise\n");
+            return DecomposeElementwise(arch, std::move(op));
+        }
+
+        assert(ifmConn->slice.offset.IsEmpty() && ifmConn->slice.shape.IsEmpty());
+        assert(ofmConn->slice.offset.IsEmpty() && ofmConn->slice.shape.IsEmpty());
+
+        // Decompose a transpose by performing a selection sort of the axes. Each swap in the selection sort algorithm
+        // expands to one or more transpose ops.
+        //
+        // Example:
+        //
+        // Input shape:        [ 3,  7, 11, 13]
+        // Permutation vector: [ 1,  3,  0,  2]
+        // Sort order:         [ 2,  0,  3,  1]
+        // Output shape:       [ 7, 13,  3, 11]
+        //
+        // Selection sort swaps:
+        //
+        // Swap 1: Pos 0 <-> Pos 1: [7, 3,  11, 13]
+        // Swap 2: Pos 1 <-> Pos 3: [7, 13, 11,  3]
+        // Swap 3: Pos 2 <-> Pos 3: [7, 13,  3, 11]
+
+        // Calculate sort order
+        Shape order(nullptr, axes);
+        uint32_t mask = uint32_t(ofmConn->transpose);
+        for ( int i = axes - 1; i >= 0; i-- )
+        {
+            const int pos = axes - 1 - (mask & 0xF);
+            order[pos] = i;
+            mask = mask >> 4;
+        }
+
+        auto shape = ifmConn->shape;
+
+        LOG_TRACE1("DecomposeTranspose: Sort order ({})\n", order.ToString());
+        LOG_TRACE1("DecomposeTranspose: Initial shape ({})\n", shape.ToString());
+
+        for ( int axis = 0; axis < axes; axis++ )
+        {
+            // Check if axis is already in the right place
+            if ( order[axis] == axis ) continue;
+
+            // Find where the axis is
+            int i;
+            for ( i = axis + 1; i < axes; i++ )
+                if ( order[i] == axis ) break;
+            assert(i < axes);
+
+            // Move axis to right place
+            LOG_TRACE1("DecomposeTranspose: Swap {} <-> {}\n", axis, i);
+            auto tail = !result.empty() ? result.back()->OFM() : ifmConn;
+            auto subOps = SwapAxes(arch, shape, tail, axis, i);
+            result.insert(result.end(), std::make_move_iterator(subOps.begin()), std::make_move_iterator(subOps.end()));
+            std::swap(order[axis], order[i]);
+            LOG_TRACE1("DecomposeTranspose: Shape is now ({})\n", shape.ToString());
+        }
+
+        LOG_TRACE1("DecomposeTranspose: Final shape ({})\n", shape.ToString());
+
+        assert(!result.empty());
+
+        const auto &lastTensor = result.back()->OFM()->tensor;
+        for ( auto &subOp : result )
+        {
+            auto ofm = subOp->OFM();
+            if ( ofm->tensor == lastTensor )
+            {
+                // Adjust so that the last output is written to the original OFM
+                ofm->tensor = ofmConn->tensor;
+                ofm->tensor->producers.push_back(subOp.get());
+                ofm->quantization = ofmConn->quantization;
+            }
+        }
+        return result;
+    }
 
     // We can handle all transpositions in a 3D shape
-    if ( (axes < 4 || ifmShape.Elements() == ifmShape.Height() * ifmShape.Width() * ifmShape.Depth()) && supported )
+    if ( decomposeAxes )
     {
         for ( int axis = 0; axis < axes; axis++ )
         {
@@ -1641,90 +1705,11 @@ std::vector<std::unique_ptr<SchedulerOperation>> DecomposeTranspose(Architecture
                 return DecomposeLargeAxis(axis, MAX_DIM, arch, std::move(op), DecomposeTranspose);
             }
         }
-
-        // No decomposition required
-        result.push_back(std::move(op));
-        return result;
-    }
-
-    // We can handle TransposeType::None as an elementwise, because it's basically a memory copy
-    if ( ofmConn->transpose == TransposeType::None )
-    {
-        LOG_TRACE1("DecomposeTranspose: Decomposing as elementwise\n");
-        return DecomposeElementwise(arch, std::move(op));
     }
 
-    assert(ifmConn->slice.offset.IsEmpty() && ifmConn->slice.shape.IsEmpty());
-    assert(ofmConn->slice.offset.IsEmpty() && ofmConn->slice.shape.IsEmpty());
-
-    // Decompose a transpose by peforming a selection sort of the axes. Each swap in the selection sort algorithm
-    // expands to one or more transpose ops.
-    //
-    // Example:
-    //
-    // Input shape:        [ 3,  7, 11, 13]
-    // Permutation vector: [ 1,  3,  0,  2]
-    // Sort order:         [ 2,  0,  3,  1]
-    // Output shape:       [ 7, 13,  3, 11]
-    //
-    // Selection sort swaps:
-    //
-    // Swap 1: Pos 0 <-> Pos 1: [7, 3,  11, 13]
-    // Swap 2: Pos 1 <-> Pos 3: [7, 13, 11,  3]
-    // Swap 3: Pos 2 <-> Pos 3: [7, 13,  3, 11]
-
-    // Calculate sort order
-    Shape order(nullptr, axes);
-    uint32_t mask = uint32_t(ofmConn->transpose);
-    for ( int i = axes - 1; i >= 0; i-- )
-    {
-        const int pos = axes - 1 - (mask & 0xF);
-        order[pos] = i;
-        mask = mask >> 4;
-    }
-
-    auto shape = ifmConn->shape;
-
-    LOG_TRACE1("DecomposeTranspose: Sort order ({})\n", order.ToString());
-    LOG_TRACE1("DecomposeTranspose: Initial shape ({})\n", shape.ToString());
-
-    for ( int axis = 0; axis < axes; axis++ )
-    {
-        // Check if axis is already in the right place
-        if ( order[axis] == axis ) continue;
-
-        // Find where the axis is
-        int i;
-        for ( i = axis + 1; i < axes; i++ )
-            if ( order[i] == axis ) break;
-        assert(i < axes);
-
-        // Move axis to right place
-        LOG_TRACE1("DecomposeTranspose: Swap {} <-> {}\n", axis, i);
-        auto tail = !result.empty() ? result.back()->OFM() : ifmConn;
-        auto subOps = SwapAxes(arch, shape, tail, axis, i);
-        result.insert(result.end(), std::make_move_iterator(subOps.begin()), std::make_move_iterator(subOps.end()));
-        std::swap(order[axis], order[i]);
-        LOG_TRACE1("DecomposeTranspose: Shape is now ({})\n", shape.ToString());
-    }
-
-    LOG_TRACE1("DecomposeTranspose: Final shape ({})\n", shape.ToString());
-
-    assert(!result.empty());
-
-    const auto &lastTensor = result.back()->OFM()->tensor;
-    for ( auto &subOp : result )
-    {
-        auto ofm = subOp->OFM();
-        if ( ofm->tensor == lastTensor )
-        {
-            // Adjust to that last output is written to the original OFM
-            ofm->tensor = ofmConn->tensor;
-            ofm->tensor->producers.push_back(subOp.get());
-            ofm->quantization = ofmConn->quantization;
-        }
-    }
 
+    // No decomposition required
+    result.push_back(std::move(op));
     return result;
 }
 
diff --git a/ethosu/regor/compiler/scheduler_decompose.hpp b/ethosu/regor/compiler/scheduler_decompose.hpp
index 20c392a70ea53dbf6a67e32331dfdc7461de48ce..8853c7979ade75de9f4f26d720f9509237b25847 100644
--- a/ethosu/regor/compiler/scheduler_decompose.hpp
+++ b/ethosu/regor/compiler/scheduler_decompose.hpp
@@ -31,9 +31,9 @@ class DecompositionFailure : public std::runtime_error
 public:
     DecompositionFailure(const std::string &what = "") : std::runtime_error(what) {}
 };
-
+Flags<QueryResult> OperatorQuery(Architecture *arch, const SchedulerOperation *schedOp, ArchRequirements *req);
+bool ShouldDecompose(Architecture *arch, const SchedulerOperation *schedOp);
 bool NeedsDecompose(Architecture *arch, const SchedulerOperation *schedOp);
-bool CanRunOnHardware(Architecture *arch, const SchedulerOperation *schedOp);
 bool CanDecompose(Architecture *arch, const SchedulerOperation *schedOp);
 std::vector<std::unique_ptr<SchedulerOperation>> DecomposeConv2D(Architecture *arch, std::unique_ptr<SchedulerOperation> op);
 std::vector<std::unique_ptr<SchedulerOperation>> DecomposeConv3D(Architecture *arch, std::unique_ptr<SchedulerOperation> op);
@@ -53,7 +53,7 @@ inline ArchFM &Set(ArchFM &fm, const SchedulerConnection *conn)
     if ( conn )
     {
         fm.type = conn->tensor->dataType;
-        fm.shape = conn->slice.shape ? conn->slice.shape : conn->shape;
+        fm.shape = conn->SliceShape();
         fm.format = conn->tensor->format;
     }
     return fm;
diff --git a/ethosu/regor/compiler/scheduler_packing.cpp b/ethosu/regor/compiler/scheduler_packing.cpp
index 0007053b0bf7a2788a0ed368dffc2ceb8d1d9e18..e0942de450611a3c790474522d082e3fd62c8d36 100644
--- a/ethosu/regor/compiler/scheduler_packing.cpp
+++ b/ethosu/regor/compiler/scheduler_packing.cpp
@@ -105,7 +105,7 @@ bool IsConnected(const SchedulerOperation &first, const SchedulerOperation &seco
 }  // namespace
 
 SchedulerPacking::SchedulerPacking(Architecture *arch, bool disableChaining) :
-        _arch(arch), _disableChaining(disableChaining)
+        _arch(arch), _constraints(arch->Constraints()), _disableChaining(disableChaining)
 {
 }
 
@@ -122,6 +122,8 @@ std::vector<std::unique_ptr<SchedulerOperation>> SchedulerPacking::Process(const
 
     FilterOperations(executionList, graph);
 
+    PrePackOperations();
+
     PackOperations();
 
     ReorderOperations();
@@ -136,7 +138,7 @@ void SchedulerPacking::FilterOperations(const std::vector<Operation *> &executio
     {
         auto schedOp = MakeSchedulerOperation(op, graph);
 
-        if ( NeedsDecompose(_arch, schedOp.get()) )
+        if ( ShouldDecompose(_arch, schedOp.get()) )
         {
             auto schedOps = DecomposeSchedulerOperation(std::move(schedOp));
             _schedList.insert(
@@ -183,6 +185,35 @@ ArchitectureOpGroupQuery SchedulerPacking::CreateOpGroupQuery(const SchedulerOpe
     return query;
 }
 
+void SchedulerPacking::SchedulerPacking::PrePackOperations()
+{
+    // Query the architecture for every scheduled op and record the NPU/CPU verdict via SetNpuOp()
+    for ( auto &schedOp : _schedList )
+    {
+        ArchRequirements oReq{};
+        Flags<QueryResult> result = OperatorQuery(_arch, schedOp.get(), &oReq);
+        // A complete query is expected never to report QueryResult::Constrained
+        assert(result.Any(QueryResult::Constrained) == false && "Constrained result from complete OperatorQuery");
+        if ( result.Any(QueryResult::Native) )
+        {
+            // TODO MLBEDSW-10643: This should be a direct-check against QueryResult::Native
+            // HasRequirements at this point should result in CPU-fallback
+            if ( result.Any(QueryResult::HasRequirements) && oReq.req.Any(ArchRequirement::Decompose) )
+            {
+                schedOp->SetNpuOp(false);
+            }
+            else
+            {
+                schedOp->SetNpuOp(true);
+            }
+        }
+        else
+        {
+            schedOp->SetNpuOp(false);
+        }
+    }
+}
+
 void SchedulerPacking::SchedulerPacking::PackOperations()
 {
     LOG_TRACE1("Scheduler Packing (of {0} Ops)\n", _schedList.size());
@@ -203,17 +234,14 @@ void SchedulerPacking::SchedulerPacking::PackOperations()
 
         cur++;
 
-        LOG_TRACE1("Creating new group with {}\n", OpTypeToString(primaryOp->Type()));
-
-        auto op0 = CreateOpGroupQuery(primaryOp);
-
-        // Try to create OpGroup
-        auto group = _arch->CreateOpGroup(op0);
-
         // OpGroup is nullptr if op can't run on NPU
-        if ( group )
+        if ( primaryOp->IsNpuOp() )
         {
-            primaryOp->SetNpuOp(true);
+            LOG_TRACE1("Creating new group with {}\n", OpTypeToString(primaryOp->Type()));
+            auto op0 = CreateOpGroupQuery(primaryOp);
+            // Try to create OpGroup
+            auto group = _arch->CreateOpGroup(op0);
+            assert(group);
 
             // First op in group has key 0
             int prevOpKey = 0;
@@ -238,7 +266,6 @@ void SchedulerPacking::SchedulerPacking::PackOperations()
                     LOG_TRACE1("Can't add next op\n");
                     break;
                 }
-                nextOp->SetNpuOp(true);
                 nextOp->SetParent(primaryOp);
                 nextOp->SetOpGroupKey(key);
 
@@ -384,6 +411,12 @@ int SchedulerPacking::CanPack(const SchedulerOperation *schedOp, const Scheduler
     assert(prevOFM && "primary/prev op must have OFM");
     assert(ifmTensor && "next op must have IFM");
 
+    // can't pack CPU operations
+    if ( !schedOp->IsNpuOp() )
+    {
+        return 0;
+    }
+
     // Previous op in execution order doesn't connect to this one
     if ( prevOFM != ifmTensor && prevOFM != ifm2Tensor )
     {
@@ -548,6 +581,7 @@ std::unique_ptr<SchedulerOperation> SchedulerPacking::MakeSchedulerOperation(Ope
     Set(query.ofm, ofmConn);
     query.reverseMask = ofmConn->reverse;
     query.transposeMask = ofmConn->transpose;
+    query.kernel = schedOp->Kernel();
 
     ArchRequirements req;
     if ( _arch->Constraints()->OperatorQuery(op->Type(), &query, &req).Any(QueryResult::Native) )
diff --git a/ethosu/regor/compiler/scheduler_packing.hpp b/ethosu/regor/compiler/scheduler_packing.hpp
index 78abbc139ec4dafe11a820ba5ab056a2636bc0ae..4062185168d0be15a55741e3f0f519b7049a757d 100644
--- a/ethosu/regor/compiler/scheduler_packing.hpp
+++ b/ethosu/regor/compiler/scheduler_packing.hpp
@@ -1,5 +1,5 @@
 //
-// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+// SPDX-FileCopyrightText: Copyright 2021-2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
 //
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -21,6 +21,7 @@
 #include "common/common.hpp"
 #include "common/logging.hpp"
 
+#include "architecture/architecture_constraints.hpp"
 #include "common/shape.hpp"
 #include "graph.hpp"
 #include "operation.hpp"
@@ -45,6 +46,7 @@ class SchedulerPacking
 {
 protected:
     Architecture *_arch = nullptr;
+    IArchitectureConstraints *_constraints = nullptr;
     bool _disableChaining = false;
     std::vector<std::unique_ptr<SchedulerOperation>> _schedList;
     std::unordered_map<Tensor *, std::shared_ptr<SchedulerTensor>> _tensorMap;
@@ -56,8 +58,13 @@ public:
     std::vector<std::unique_ptr<SchedulerOperation>> Process(const Graph *graph);
 
 private:
+    // Decomposes operations
     void FilterOperations(const std::vector<Operation *> &executionList, const Graph *graph);
+    // Determines NPU/CPU-target
+    void PrePackOperations();
+    // Performs fusing/chaining
     void PackOperations();
+    // Reorders CPU-operations
     void ReorderOperations();
 
     int CanPack(const SchedulerOperation *schedOp, const SchedulerOperation *prevOp, const SchedulerOperation *op, const int prevOpKey) const;
@@ -67,6 +74,7 @@ private:
     std::vector<std::unique_ptr<SchedulerOperation>> DecomposeSchedulerOperation(std::unique_ptr<SchedulerOperation> op);
     ArchResampling ResamplingMode(TensorUsage usage, OpType opType) const;
     ArchitectureOpGroupQuery CreateOpGroupQuery(const SchedulerOperation *schedOp) const;
+    ArchOperatorQuery CreateOperatorQuery(const SchedulerOperation *schedOp) const;
 };
 
 }  // namespace regor